Compare commits
3 Commits
ab72d875bf
...
6cea9b7e59
Author | SHA1 | Date | |
---|---|---|---|
|
6cea9b7e59 | ||
|
f740e4bad1 | ||
|
404e308382 |
@ -20,13 +20,37 @@ public class SaneTsv
|
||||
BINARY,
|
||||
}
|
||||
|
||||
protected enum FormatType
|
||||
{
|
||||
SANE_TSV = 0,
|
||||
TYPED_TSV = 1,
|
||||
COMMENTED_TSV = 2,
|
||||
}
|
||||
|
||||
// TODO: We need to be able to update all these in tandem somehow
|
||||
public string[] ColumnNames { get; protected set; }
|
||||
public ColumnType[] ColumnTypes { get; protected set; }
|
||||
public Dictionary<string, List<object>> Columns { get; protected set; }
|
||||
public List<SaneTsvRecord> Records { get; protected set; }
|
||||
public string FileComment { get; protected set; } = null;
|
||||
|
||||
public static SaneTsv Parse(byte[] inputBuffer)
|
||||
public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
|
||||
{
|
||||
return Parse(inputBuffer, FormatType.SANE_TSV);
|
||||
}
|
||||
|
||||
public static SaneTsv ParseTypedTsv(byte[] inputBuffer)
|
||||
{
|
||||
return Parse(inputBuffer, FormatType.TYPED_TSV);
|
||||
}
|
||||
|
||||
public static SaneTsv ParseCommentedTsv(byte[] inputBuffer)
|
||||
{
|
||||
return Parse(inputBuffer, FormatType.COMMENTED_TSV);
|
||||
}
|
||||
|
||||
// TODO: Have parsing errors include line / column #
|
||||
protected static SaneTsv Parse(byte[] inputBuffer, FormatType format)
|
||||
{
|
||||
var parsed = new SaneTsv();
|
||||
parsed.Columns = new Dictionary<string, List<object>>();
|
||||
@ -36,7 +60,11 @@ public class SaneTsv
|
||||
|
||||
var fieldBytes = new List<byte>();
|
||||
var fields = new List<byte[]>();
|
||||
var currentComment = new StringBuilder();
|
||||
|
||||
int numFields = -1;
|
||||
int line = 1;
|
||||
int currentLineStart = 0;
|
||||
for (int i = 0; i < inputBuffer.Count(); i++)
|
||||
{
|
||||
if (inputBuffer[i] == '\\')
|
||||
@ -60,6 +88,11 @@ public class SaneTsv
|
||||
fieldBytes.Add((byte)'\t');
|
||||
i++;
|
||||
}
|
||||
else if (inputBuffer[i + 1] == '#')
|
||||
{
|
||||
fieldBytes.Add((byte)'#');
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}");
|
||||
@ -101,12 +134,20 @@ public class SaneTsv
|
||||
|
||||
string columnTypeString;
|
||||
string columnName;
|
||||
if (columnString.Contains(":")) {
|
||||
if (columnString.Contains(':')) {
|
||||
if (format == FormatType.SANE_TSV)
|
||||
{
|
||||
throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names");
|
||||
}
|
||||
columnTypeString = columnString.Split(":").Last();
|
||||
columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (format > FormatType.SANE_TSV)
|
||||
{
|
||||
throw new Exception($"Header {fields.Count} has no type");
|
||||
}
|
||||
columnTypeString = "";
|
||||
columnName = columnString;
|
||||
}
|
||||
@ -163,20 +204,57 @@ public class SaneTsv
|
||||
parsed.ColumnTypes[j] = type;
|
||||
}
|
||||
|
||||
if (numTypesBlank != 0 && numTypesBlank != fields.Count)
|
||||
if (currentComment.Length > 0)
|
||||
{
|
||||
throw new Exception("Types must be provided for all columns or none. Use 'string' for columns missing types.");
|
||||
parsed.FileComment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
|
||||
fields.Clear();
|
||||
}
|
||||
else if (numFields != fields.Count)
|
||||
{
|
||||
throw new Exception($"Expected {numFields} fields on line {parsed.Records.Count + 2}, but found {fields.Count}");
|
||||
throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
|
||||
}
|
||||
else
|
||||
{
|
||||
AddRecord(parsed, fields);
|
||||
string comment = null;
|
||||
if (currentComment.Length > 0)
|
||||
{
|
||||
comment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||
fields.Clear();
|
||||
}
|
||||
|
||||
line++;
|
||||
currentLineStart = i + 1;
|
||||
}
|
||||
else if (inputBuffer[i] == '#')
|
||||
{
|
||||
if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
|
||||
{
|
||||
int j = i;
|
||||
for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
|
||||
if (j < inputBuffer.Length)
|
||||
{
|
||||
var commentBytes = new byte[j - i - 1];
|
||||
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
|
||||
currentComment.Append(Encoding.UTF8.GetString(commentBytes));
|
||||
currentComment.Append("\n");
|
||||
i = j;
|
||||
currentLineStart = i + 1;
|
||||
line++;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception("Comments at end of file are not allowed");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -197,13 +275,20 @@ public class SaneTsv
|
||||
}
|
||||
else
|
||||
{
|
||||
AddRecord(parsed, fields);
|
||||
string comment = null;
|
||||
if (currentComment.Length > 0)
|
||||
{
|
||||
comment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||
fields.Clear();
|
||||
}
|
||||
|
||||
return parsed;
|
||||
}
|
||||
|
||||
protected static void AddRecord(SaneTsv parsed, List<byte[]> fields)
|
||||
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
|
||||
{
|
||||
var parsedFields = new object[fields.Count];
|
||||
for (int j = 0; j < fields.Count; j++)
|
||||
@ -223,7 +308,7 @@ public class SaneTsv
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UTF-8", e);
|
||||
throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
|
||||
}
|
||||
|
||||
switch (parsed.ColumnTypes[j])
|
||||
@ -244,7 +329,7 @@ public class SaneTsv
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");
|
||||
throw new Exception($"Field {j} on line {line} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedBool;
|
||||
@ -253,7 +338,7 @@ public class SaneTsv
|
||||
case ColumnType.FLOAT32:
|
||||
if (!float.TryParse(fieldString, out float parsedFloat))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid single-precision float");
|
||||
throw new Exception($"Field {j} on line {line} is not valid single-precision float");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedFloat;
|
||||
@ -262,7 +347,7 @@ public class SaneTsv
|
||||
case ColumnType.FLOAT64:
|
||||
if (!double.TryParse(fieldString, out double parsedDouble))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid double-precision float");
|
||||
throw new Exception($"Field {j} on line {line} is not valid double-precision float");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedDouble;
|
||||
@ -271,7 +356,7 @@ public class SaneTsv
|
||||
case ColumnType.UINT32:
|
||||
if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UInt32");
|
||||
throw new Exception($"Field {j} on line {line} is not valid UInt32");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedUInt32;
|
||||
@ -280,7 +365,7 @@ public class SaneTsv
|
||||
case ColumnType.UINT64:
|
||||
if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UInt64");
|
||||
throw new Exception($"Field {j} on line {line} is not valid UInt64");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedUInt64;
|
||||
@ -289,7 +374,7 @@ public class SaneTsv
|
||||
case ColumnType.INT32:
|
||||
if (!Int32.TryParse(fieldString, out Int32 parsedInt32))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid Int32");
|
||||
throw new Exception($"Field {j} on line {line} is not valid Int32");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedInt32;
|
||||
@ -298,7 +383,7 @@ public class SaneTsv
|
||||
case ColumnType.INT64:
|
||||
if (!Int64.TryParse(fieldString, out Int64 parsedInt64))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid Int64");
|
||||
throw new Exception($"Field {j} on line {line} is not valid Int64");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedInt64;
|
||||
@ -311,8 +396,7 @@ public class SaneTsv
|
||||
}
|
||||
}
|
||||
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
|
||||
fields.Clear();
|
||||
return parsedFields;
|
||||
}
|
||||
|
||||
public SaneTsvRecord this[int i] => Records[i];
|
||||
@ -320,14 +404,16 @@ public class SaneTsv
|
||||
public class SaneTsvRecord
|
||||
{
|
||||
public SaneTsv Parent { get; }
|
||||
public string Comment { get; }
|
||||
public object[] Fields { get; }
|
||||
|
||||
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
|
||||
|
||||
public SaneTsvRecord(SaneTsv parent, object[] fields)
|
||||
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
|
||||
{
|
||||
Parent = parent;
|
||||
Fields = fields;
|
||||
Comment = comment;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,12 +3,12 @@ using System.Text;
|
||||
|
||||
{
|
||||
string testName = "Bool test";
|
||||
string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
|
||||
string testString1 = "column1:ty\\#pe:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
|
||||
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
|
||||
"\nFALSE\tnother\tno\\ther";
|
||||
|
||||
SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
|
||||
if (parsed.Records[0]["column1:type"] is bool result && result)
|
||||
SaneTsv parsed = SaneTsv.ParseTypedTsv(Encoding.UTF8.GetBytes(testString1));
|
||||
if (parsed.Records[0]["column1:ty#pe"] is bool result && result)
|
||||
{
|
||||
Console.WriteLine($"Passed {testName}");
|
||||
}
|
||||
@ -26,7 +26,7 @@ using System.Text;
|
||||
"\nTUE\tvalue\\\\t\0woo\tvaluetrhee" +
|
||||
"\nFALSE\tnother\tno\\ther";
|
||||
|
||||
SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
|
||||
SaneTsv parsed = SaneTsv.ParseTypedTsv(Encoding.UTF8.GetBytes(testString1));
|
||||
Console.WriteLine($"Failed {testName}");
|
||||
}
|
||||
catch (Exception)
|
||||
|
58
SaneTsv/readme.md
Normal file
58
SaneTsv/readme.md
Normal file
@ -0,0 +1,58 @@
|
||||
# Sane TSV
|
||||
|
||||
Sane TSV is a strict format for tabular data.
|
||||
|
||||
'\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line.
|
||||
|
||||
'\n' and '\t' characters are allowed within fields by escaping them with a backslash character (0x5C) followed by 'n' (0x6E) and 't' (0x74) respectively. Additionally, '\\' and '#' (0x23) must also be escaped. The '#' character is escaped for compatibility with [Commented TSVs](#commented-tsv).
|
||||
|
||||
All fields must be UTF-8 encoded text. All escaping can be done before decoding (and after encoding).
|
||||
|
||||
Empty fields (i.e. two subsequent '\t' characters) are allowed.
|
||||
|
||||
The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)).
|
||||
|
||||
All lines in the file must have the same number of fields.
|
||||
|
||||
The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error.
|
||||
|
||||
Implementations of the format do not need to handle file reading and writing directly, but if they do, they should enforce usage of the file extension '.stsv'. They should also provide a manual override option so that other extensions may be forced.
|
||||
|
||||
# Typed TSV
|
||||
|
||||
Typed TSV allows for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
|
||||
|
||||
- 'string'
|
||||
- 'boolean'
|
||||
- 'float32'
|
||||
- 'float64'
|
||||
- 'uint32'
|
||||
- 'uint64'
|
||||
- 'int32'
|
||||
- 'int64'
|
||||
- 'binary'
|
||||
|
||||
Any other value is an error; however, the portion of the name prior to the last ':' may be anything and may include ':' characters.
|
||||
|
||||
All fields in the rest of the file must be of the type corresponding to their column.
|
||||
|
||||
Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
|
||||
|
||||
- 'boolean' fields must contain only and exactly the text "TRUE" or "FALSE".
|
||||
- 'float32' and 'float64' correspond to single and double precision IEEE 754 floating-point numbers respectively. They should be formatted like this regex: `-?[0-9]\.([0-9]|[0-9]+[1-9])E-?[1-9][0-9]*`
|
||||
|
||||
Both float types may additionally have these values:
|
||||
- 'sNaN'
|
||||
- 'qNaN'
|
||||
- '+inf'
|
||||
- '-inf'
|
||||
- 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
|
||||
- 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)
|
||||
|
||||
# Commented TSV
|
||||
|
||||
Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped.
|
||||
|
||||
Comments must be UTF-8 encoded text.
|
||||
|
||||
Comments after the last record are an error.
|
Loading…
Reference in New Issue
Block a user