Compare commits

...

3 Commits

Author SHA1 Message Date
Nathan McRae
6cea9b7e59 Add comment parsing 2024-02-14 18:31:58 -08:00
Nathan McRae
f740e4bad1 Add readme 2024-02-14 16:16:53 -08:00
Nathan McRae
404e308382 Add different types of parsing 2024-02-14 16:16:23 -08:00
3 changed files with 167 additions and 23 deletions

View File

@ -20,13 +20,37 @@ public class SaneTsv
BINARY,
}
// Selects which TSV dialect Parse enforces.
// The numeric order is significant: Parse compares formats with '>' and '>=',
// so each later value also gets the checks applied to the earlier ones.
protected enum FormatType
{
// Header names must not contain ':' (Parse throws if one does).
SANE_TSV = 0,
// Every header must end in a ':type' suffix (Parse throws if the type is missing).
TYPED_TSV = 1,
// Additionally accepts lines that begin with '#' as comments.
COMMENTED_TSV = 2,
}
// TODO: We need to be able to update all these in tandem somehow
public string[] ColumnNames { get; protected set; }
public ColumnType[] ColumnTypes { get; protected set; }
public Dictionary<string, List<object>> Columns { get; protected set; }
public List<SaneTsvRecord> Records { get; protected set; }
public string FileComment { get; protected set; } = null;
public static SaneTsv Parse(byte[] inputBuffer)
// Parses inputBuffer under the SANE_TSV rules by forwarding to the shared
// Parse(byte[], FormatType) routine; any exception it throws propagates unchanged.
public static SaneTsv ParseSaneTsv(byte[] inputBuffer) => Parse(inputBuffer, FormatType.SANE_TSV);
// Parses inputBuffer under the TYPED_TSV rules by forwarding to the shared
// Parse(byte[], FormatType) routine; any exception it throws propagates unchanged.
public static SaneTsv ParseTypedTsv(byte[] inputBuffer) => Parse(inputBuffer, FormatType.TYPED_TSV);
// Parses inputBuffer under the COMMENTED_TSV rules by forwarding to the shared
// Parse(byte[], FormatType) routine; any exception it throws propagates unchanged.
public static SaneTsv ParseCommentedTsv(byte[] inputBuffer) => Parse(inputBuffer, FormatType.COMMENTED_TSV);
// TODO: Have parsing errors include line / column #
protected static SaneTsv Parse(byte[] inputBuffer, FormatType format)
{
var parsed = new SaneTsv();
parsed.Columns = new Dictionary<string, List<object>>();
@ -36,7 +60,11 @@ public class SaneTsv
var fieldBytes = new List<byte>();
var fields = new List<byte[]>();
var currentComment = new StringBuilder();
int numFields = -1;
int line = 1;
int currentLineStart = 0;
for (int i = 0; i < inputBuffer.Count(); i++)
{
if (inputBuffer[i] == '\\')
@ -60,6 +88,11 @@ public class SaneTsv
fieldBytes.Add((byte)'\t');
i++;
}
else if (inputBuffer[i + 1] == '#')
{
fieldBytes.Add((byte)'#');
i++;
}
else
{
throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}");
@ -101,12 +134,20 @@ public class SaneTsv
string columnTypeString;
string columnName;
if (columnString.Contains(":")) {
if (columnString.Contains(':')) {
if (format == FormatType.SANE_TSV)
{
throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names");
}
columnTypeString = columnString.Split(":").Last();
columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1);
}
else
{
if (format > FormatType.SANE_TSV)
{
throw new Exception($"Header {fields.Count} has no type");
}
columnTypeString = "";
columnName = columnString;
}
@ -163,20 +204,57 @@ public class SaneTsv
parsed.ColumnTypes[j] = type;
}
if (numTypesBlank != 0 && numTypesBlank != fields.Count)
if (currentComment.Length > 0)
{
throw new Exception("Types must be provided for all columns or none. Use 'string' for columns missing types.");
parsed.FileComment = currentComment.ToString();
currentComment.Clear();
}
fields.Clear();
}
else if (numFields != fields.Count)
{
throw new Exception($"Expected {numFields} fields on line {parsed.Records.Count + 2}, but found {fields.Count}");
throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
}
else
{
AddRecord(parsed, fields);
string comment = null;
if (currentComment.Length > 0)
{
comment = currentComment.ToString();
currentComment.Clear();
}
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear();
}
line++;
currentLineStart = i + 1;
}
else if (inputBuffer[i] == '#')
{
if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
{
int j = i;
for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
if (j < inputBuffer.Length)
{
var commentBytes = new byte[j - i - 1];
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
currentComment.Append(Encoding.UTF8.GetString(commentBytes));
currentComment.Append("\n");
i = j;
currentLineStart = i + 1;
line++;
}
else
{
throw new Exception("Comments at end of file are not allowed");
}
}
else
{
throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
}
}
else
@ -197,13 +275,20 @@ public class SaneTsv
}
else
{
AddRecord(parsed, fields);
string comment = null;
if (currentComment.Length > 0)
{
comment = currentComment.ToString();
currentComment.Clear();
}
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear();
}
return parsed;
}
protected static void AddRecord(SaneTsv parsed, List<byte[]> fields)
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
{
var parsedFields = new object[fields.Count];
for (int j = 0; j < fields.Count; j++)
@ -223,7 +308,7 @@ public class SaneTsv
}
catch (Exception e)
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UTF-8", e);
throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
}
switch (parsed.ColumnTypes[j])
@ -244,7 +329,7 @@ public class SaneTsv
}
else
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");
throw new Exception($"Field {j} on line {line} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");
}
parsedFields[j] = parsedBool;
@ -253,7 +338,7 @@ public class SaneTsv
case ColumnType.FLOAT32:
if (!float.TryParse(fieldString, out float parsedFloat))
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid single-precision float");
throw new Exception($"Field {j} on line {line} is not valid single-precision float");
}
parsedFields[j] = parsedFloat;
@ -262,7 +347,7 @@ public class SaneTsv
case ColumnType.FLOAT64:
if (!double.TryParse(fieldString, out double parsedDouble))
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid double-precision float");
throw new Exception($"Field {j} on line {line} is not valid double-precision float");
}
parsedFields[j] = parsedDouble;
@ -271,7 +356,7 @@ public class SaneTsv
case ColumnType.UINT32:
if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UInt32");
throw new Exception($"Field {j} on line {line} is not valid UInt32");
}
parsedFields[j] = parsedUInt32;
@ -280,7 +365,7 @@ public class SaneTsv
case ColumnType.UINT64:
if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64))
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UInt64");
throw new Exception($"Field {j} on line {line} is not valid UInt64");
}
parsedFields[j] = parsedUInt64;
@ -289,7 +374,7 @@ public class SaneTsv
case ColumnType.INT32:
if (!Int32.TryParse(fieldString, out Int32 parsedInt32))
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid Int32");
throw new Exception($"Field {j} on line {line} is not valid Int32");
}
parsedFields[j] = parsedInt32;
@ -298,7 +383,7 @@ public class SaneTsv
case ColumnType.INT64:
if (!Int64.TryParse(fieldString, out Int64 parsedInt64))
{
throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid Int64");
throw new Exception($"Field {j} on line {line} is not valid Int64");
}
parsedFields[j] = parsedInt64;
@ -311,8 +396,7 @@ public class SaneTsv
}
}
parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
fields.Clear();
return parsedFields;
}
public SaneTsvRecord this[int i] => Records[i];
@ -320,14 +404,16 @@ public class SaneTsv
// One parsed row of a SaneTsv, together with the comment text (if any) supplied for it.
public class SaneTsvRecord
{
// The SaneTsv this record was created for; its ColumnNames are used to resolve name lookups.
public SaneTsv Parent { get; }
// Comment supplied at construction; the parser passes null when no comment text was collected for the record.
public string Comment { get; }
// The record's field values, exactly as passed to the constructor.
public object[] Fields { get; }
// Returns the field whose position matches columnName's position in Parent.ColumnNames.
// NOTE(review): Array.IndexOf yields -1 for an unknown name, so this would index Fields[-1] - confirm the intended failure mode.
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
// Stores the supplied references as-is (no copying).
public SaneTsvRecord(SaneTsv parent, object[] fields)
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
{
Parent = parent;
Fields = fields;
Comment = comment;
}
}
}

View File

@ -3,12 +3,12 @@ using System.Text;
{
string testName = "Bool test";
string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
string testString1 = "column1:ty\\#pe:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
if (parsed.Records[0]["column1:type"] is bool result && result)
SaneTsv parsed = SaneTsv.ParseTypedTsv(Encoding.UTF8.GetBytes(testString1));
if (parsed.Records[0]["column1:ty#pe"] is bool result && result)
{
Console.WriteLine($"Passed {testName}");
}
@ -26,7 +26,7 @@ using System.Text;
"\nTUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
SaneTsv parsed = SaneTsv.ParseTypedTsv(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName}");
}
catch (Exception)

58
SaneTsv/readme.md Normal file
View File

@ -0,0 +1,58 @@
# Sane TSV
Sane TSV is a strict format for tabular data.
'\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line.
'\n' and '\t' characters are allowed within fields by escaping them with a backslash character (0x5C) followed by 'n' (0x6E) and 't' (0x74) respectively. Additionally, '\\' and '#' (0x23) must also be escaped. The '#' character is escaped for compatibility with [Commented TSVs](#commented-tsv).
All fields must be UTF-8 encoded text. All escaping can be done before decoding (and after encoding).
Empty fields (i.e. two subsequent '\t' characters) are allowed.
The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)).
All lines in the file must have the same number of fields.
The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error.
Implementations of the format do not need to handle file reading and writing directly, but if they do, they should enforce usage of the file extension '.stsv'. They should also provide a manual override option so that other extensions may be forced.
# Typed TSV
Typed TSV allows for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
- 'string'
- 'boolean'
- 'float32'
- 'float64'
- 'uint32'
- 'uint64'
- 'int32'
- 'int64'
- 'binary'
Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters.
All fields in the rest of the file must be of the type corresponding to their column.
Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
- 'boolean' fields must contain only and exactly the text "TRUE" or "FALSE".
- 'float32' and 'float64' correspond to single and double precision IEEE 754 floating-point numbers respectively. They should be formatted like this regex: `-?[0-9]\.([0-9]|[0-9]+[1-9])E-?[1-9][0-9]*`
Both float types may additionally have these values:
- 'sNaN'
- 'qNaN'
- '+inf'
- '-inf'
- 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
- 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)
# Commented TSV
Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped.
Comments must be UTF-8 encoded text.
Comments after the last record are an error.