Compare commits

...

10 Commits

Author SHA1 Message Date
d9ef2a4bb6 Update roadmap 2024-03-16 09:51:26 -07:00
a80206767e Change column type management
The column types were tracked just as a Type. This changes them to be an instance
so they can track additional information (such as the specific units of a physical
units type). Because of this, the column type attribute needs to be passed as a string
(see CS0181).
2024-03-10 22:28:05 -07:00
b8ae3ce65d Fix file comment serialization 2024-03-10 22:16:04 -07:00
0fd092685d Remove unused SaneTsv fields 2024-03-10 12:55:21 -07:00
55fa00a6e7 Clarify terminology of 'header'/'column'
The header is just the region containing the column names / types.
2024-03-09 13:13:41 -08:00
d428af51bb Flesh out tests 2024-03-09 09:58:25 -08:00
aef92e87d4 Update some comments 2024-03-09 09:58:09 -08:00
b56236cbb7 Fix some end-of-file simple TSV parsing 2024-03-09 09:57:56 -08:00
7230f982ac Fix some minor line/column numbering issues 2024-03-09 09:57:27 -08:00
f4145bacd2 Fix column attribute bug 2024-03-09 09:57:00 -08:00
3 changed files with 675 additions and 153 deletions

View File

@ -43,9 +43,6 @@ public class SaneTsv
public static readonly byte[] TrueEncoded = Encoding.UTF8.GetBytes("TRUE"); public static readonly byte[] TrueEncoded = Encoding.UTF8.GetBytes("TRUE");
public static readonly byte[] FalseEncoded = Encoding.UTF8.GetBytes("FALSE"); public static readonly byte[] FalseEncoded = Encoding.UTF8.GetBytes("FALSE");
// TODO: We need to be able to update all these in tandem somehow
public string[] ColumnNames { get; protected set; }
public Type[] ColumnTypes { get; protected set; }
protected static bool? _littleEndian = null; protected static bool? _littleEndian = null;
public static bool LittleEndian public static bool LittleEndian
{ {
@ -71,7 +68,6 @@ public class SaneTsv
public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new() public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new()
{ {
// TODO: add the file comment?
return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV); return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);
} }
@ -88,22 +84,22 @@ public class SaneTsv
} }
parsed.Records = new List<T>(); parsed.Records = new List<T>();
var headerTypes = new List<Type>(); var columnTypes = new List<ColumnType>();
var headerNames = new List<string>(); var columnNames = new List<string>();
var headerPropertyInfos = new List<PropertyInfo>(); var columnPropertyInfos = new List<PropertyInfo>();
int columnCount = 0; int columnCount = 0;
foreach (PropertyInfo property in typeof(T).GetProperties()) foreach (PropertyInfo property in typeof(T).GetProperties())
{ {
TypedTsvColumnAttribute attribute = (TypedTsvColumnAttribute)Attribute.GetCustomAttribute(property, typeof(TypedTsvColumnAttribute)); TsvColumnAttribute attribute = (TsvColumnAttribute)Attribute.GetCustomAttribute(property, typeof(TsvColumnAttribute));
if (attribute == null) if (attribute == null)
{ {
continue; continue;
} }
headerNames.Add(attribute.ColumnName ?? property.Name); columnNames.Add(attribute.ColumnName ?? property.Name);
headerTypes.Add(attribute.ColumnType ?? GetColumnFromType(property.PropertyType)); columnTypes.Add(attribute.ColumnType ?? GetColumnFromType(property.PropertyType));
headerPropertyInfos.Add(property); columnPropertyInfos.Add(property);
// TODO: Check that the property type and given column type are compatible // TODO: Check that the property type and given column type are compatible
columnCount++; columnCount++;
} }
@ -170,7 +166,7 @@ public class SaneTsv
} }
catch (Exception e) catch (Exception e)
{ {
throw new Exception($"Header {fields.Count} is not valid UTF-8", e); throw new Exception($"Header field {fields.Count} is not valid UTF-8", e);
} }
string columnTypeString; string columnTypeString;
@ -179,7 +175,7 @@ public class SaneTsv
{ {
if (format == FormatType.SIMPLE_TSV) if (format == FormatType.SIMPLE_TSV)
{ {
throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names"); throw new Exception($"Header field {j} contains ':', which is not allowed for column names");
} }
columnTypeString = columnString.Split(":").Last(); columnTypeString = columnString.Split(":").Last();
columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1); columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1);
@ -188,52 +184,52 @@ public class SaneTsv
{ {
if (format > FormatType.SIMPLE_TSV) if (format > FormatType.SIMPLE_TSV)
{ {
throw new Exception($"Header {fields.Count} has no type"); throw new Exception($"Header field {fields.Count} has no type");
} }
columnTypeString = ""; columnTypeString = "";
columnName = columnString; columnName = columnString;
} }
Type type; ColumnType type;
switch (columnTypeString) switch (columnTypeString)
{ {
case "": case "":
numTypesBlank++; numTypesBlank++;
type = typeof(StringType); type = new StringType();
break; break;
case "string": case "string":
type = typeof(StringType); type = new StringType();
break; break;
case "boolean": case "boolean":
type = typeof(BooleanType); type = new BooleanType();
break; break;
case "float32": case "float32":
type = typeof(Float32Type); type = new Float32Type();
break; break;
case "float32-le": case "float32-le":
type = typeof(Float32LEType); type = new Float32LEType();
break; break;
case "float64": case "float64":
type = typeof(Float64Type); type = new Float64Type();
break; break;
case "float64-le": case "float64-le":
type = typeof(Float64LEType); type = new Float64LEType();
break; break;
case "uint32": case "uint32":
type = typeof(UInt32Type); type = new UInt32Type();
break; break;
case "uint64": case "uint64":
type = typeof(UInt64Type); type = new UInt64Type();
break; break;
case "int32": case "int32":
type = typeof(Int32Type); type = new Int32Type();
break; break;
case "int64": case "int64":
type = typeof(Int64Type); type = new Int64Type();
break; break;
case "binary": case "binary":
type = typeof(BinaryType); type = new BinaryType();
break; break;
default: default:
throw new Exception($"Invalid type '{columnTypeString}' for column {j}"); throw new Exception($"Invalid type '{columnTypeString}' for column {j}");
@ -241,14 +237,14 @@ public class SaneTsv
// TODO: Allow lax parsing (only worry about parsing columns that are given in the specifying type // TODO: Allow lax parsing (only worry about parsing columns that are given in the specifying type
if (headerNames[j] != columnName) if (columnNames[j] != columnName)
{ {
throw new Exception($"Column {j} has name {columnName}, but expected {headerNames[j]}"); throw new Exception($"Column {j} has name {columnName}, but expected {columnNames[j]}");
} }
if (headerTypes[j] != type) if (columnTypes[j].GetType() != type.GetType())
{ {
throw new Exception($"Column {j} has type {type}, but expected {headerTypes[j]}"); throw new Exception($"Column {j} has type {type}, but expected {columnTypes[j]}");
} }
} }
@ -312,7 +308,7 @@ public class SaneTsv
// Complication: it probably depends on processor count // Complication: it probably depends on processor count
if (inputBuffer.Length < 10000) if (inputBuffer.Length < 10000)
{ {
parsed.Records.AddRange(Parse<T>(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), currentLineStart - 1, inputBuffer.Length)); parsed.Records.AddRange(Parse<T>(inputBuffer, format, columnPropertyInfos.ToArray(), columnTypes.ToArray(), currentLineStart - 1, inputBuffer.Length));
return parsed; return parsed;
} }
else else
@ -334,9 +330,10 @@ public class SaneTsv
endIndex = (i + 1) * splitCount + parseStart; endIndex = (i + 1) * splitCount + parseStart;
} }
parsedValues[i] = Parse<T>(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), startIndex, endIndex); parsedValues[i] = Parse<T>(inputBuffer, format, columnPropertyInfos.ToArray(), columnTypes.ToArray(), startIndex, endIndex);
}); });
// TODO: Handle relative line numbers
for (int i = 0; i < tasks; i++) for (int i = 0; i < tasks; i++)
{ {
parsed.Records.AddRange(parsedValues[i]); parsed.Records.AddRange(parsedValues[i]);
@ -351,7 +348,7 @@ public class SaneTsv
// startIndex is in we'd have to go back to the start of the record's comment, and to know // startIndex is in we'd have to go back to the start of the record's comment, and to know
// exactly where that comment started we'd have to go back to the start of the record before that // exactly where that comment started we'd have to go back to the start of the record before that
// (not including that other record's comment). // (not including that other record's comment).
protected static T[] Parse<T>(byte[] inputBuffer, FormatType format, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, int startIndex, int endIndex) where T : TsvRecord, new() protected static T[] Parse<T>(byte[] inputBuffer, FormatType format, PropertyInfo[] columnPropertyInfos, ColumnType[] columnTypes, int startIndex, int endIndex) where T : TsvRecord, new()
{ {
var fieldBytes = new List<byte>(); var fieldBytes = new List<byte>();
var fields = new List<byte[]>(); var fields = new List<byte[]>();
@ -421,9 +418,9 @@ public class SaneTsv
fields.Add(fieldBytes.ToArray()); fields.Add(fieldBytes.ToArray());
fieldBytes.Clear(); fieldBytes.Clear();
if (headerTypes.Length != fields.Count) if (columnTypes.Length != fields.Count)
{ {
throw new Exception($"Expected {headerTypes.Length} fields on line {relativeLine}, but found {fields.Count}"); throw new Exception($"Expected {columnTypes.Length} fields on line {relativeLine}, but found {fields.Count}");
} }
else else
{ {
@ -433,7 +430,7 @@ public class SaneTsv
comment = currentComment.ToString(); comment = currentComment.ToString();
currentComment.Clear(); currentComment.Clear();
} }
parsed.Add(ParseCurrentRecord<T>(headerTypes.ToArray(), headerPropertyInfos.ToArray(), fields, comment, relativeLine)); parsed.Add(ParseCurrentRecord<T>(columnTypes.ToArray(), columnPropertyInfos.ToArray(), fields, comment, relativeLine));
fields.Clear(); fields.Clear();
} }
@ -493,9 +490,9 @@ public class SaneTsv
// TODO // TODO
throw new Exception("Not sure when this will happen. THis might actuall be fine"); throw new Exception("Not sure when this will happen. THis might actuall be fine");
} }
if (fields.Count != headerTypes.Length) if (fields.Count != columnTypes.Length)
{ {
throw new Exception($"Expected {headerTypes} fields on line {relativeLine}, but found {fields.Count}"); throw new Exception($"Expected {columnTypes} fields on line {relativeLine}, but found {fields.Count}");
} }
else else
{ {
@ -505,19 +502,19 @@ public class SaneTsv
comment = currentComment.ToString(); comment = currentComment.ToString();
currentComment.Clear(); currentComment.Clear();
} }
parsed.Add(ParseCurrentRecord<T>(headerTypes.ToArray(), headerPropertyInfos.ToArray(), fields, comment, relativeLine)); parsed.Add(ParseCurrentRecord<T>(columnTypes.ToArray(), columnPropertyInfos.ToArray(), fields, comment, relativeLine));
fields.Clear(); fields.Clear();
} }
return parsed.ToArray(); return parsed.ToArray();
} }
protected static T ParseCurrentCommentedRecord<T>(Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new() protected static T ParseCurrentCommentedRecord<T>(ColumnType[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
{ {
return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line); return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line);
} }
protected static T ParseCurrentRecord<T>(Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : TsvRecord, new() protected static T ParseCurrentRecord<T>(ColumnType[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : TsvRecord, new()
{ {
T record = new T(); T record = new T();
@ -535,7 +532,7 @@ public class SaneTsv
for (int j = 0; j < fields.Count; j++) for (int j = 0; j < fields.Count; j++)
{ {
// All other types require the content to be UTF-8. Binary fields can ignore that. // All other types require the content to be UTF-8. Binary fields can ignore that.
if (columnTypes[j] == typeof(BinaryType)) if (columnTypes[j].GetType() == typeof(BinaryType))
{ {
// TODO: Use faster method for property setting // TODO: Use faster method for property setting
// e.g. https://blog.marcgravell.com/2012/01/playing-with-your-member.html // e.g. https://blog.marcgravell.com/2012/01/playing-with-your-member.html
@ -544,7 +541,7 @@ public class SaneTsv
properties[j].SetValue(record, fields[j]); properties[j].SetValue(record, fields[j]);
continue; continue;
} }
else if (columnTypes[j] == typeof(Float32LEType)) else if (columnTypes[j].GetType() == typeof(Float32LEType))
{ {
byte[] floatBytes; byte[] floatBytes;
if (!LittleEndian) if (!LittleEndian)
@ -563,7 +560,7 @@ public class SaneTsv
continue; continue;
} }
else if (columnTypes[j] == typeof(Float64LEType)) else if (columnTypes[j].GetType() == typeof(Float64LEType))
{ {
byte[] floatBytes; byte[] floatBytes;
if (!LittleEndian) if (!LittleEndian)
@ -595,11 +592,11 @@ public class SaneTsv
// TODO: Add checking for numeric types format // TODO: Add checking for numeric types format
if (columnTypes[j] == typeof(StringType)) if (columnTypes[j].GetType() == typeof(StringType))
{ {
properties[j].SetValue(record, fieldString); properties[j].SetValue(record, fieldString);
} }
else if (columnTypes[j] == typeof(BooleanType)) else if (columnTypes[j].GetType() == typeof(BooleanType))
{ {
bool parsedBool; bool parsedBool;
if (fieldString == "TRUE") if (fieldString == "TRUE")
@ -617,7 +614,7 @@ public class SaneTsv
properties[j].SetValue(record, parsedBool); properties[j].SetValue(record, parsedBool);
} }
else if (columnTypes[j] == typeof(Float32Type)) else if (columnTypes[j].GetType() == typeof(Float32Type))
{ {
float parsedFloat; float parsedFloat;
if (!float.TryParse(fieldString, out parsedFloat)) if (!float.TryParse(fieldString, out parsedFloat))
@ -638,7 +635,7 @@ public class SaneTsv
properties[j].SetValue(record, parsedFloat); properties[j].SetValue(record, parsedFloat);
} }
else if (columnTypes[j] == typeof(Float64Type)) else if (columnTypes[j].GetType() == typeof(Float64Type))
{ {
double parsedDouble; double parsedDouble;
if (!double.TryParse(fieldString, out parsedDouble)) if (!double.TryParse(fieldString, out parsedDouble))
@ -659,7 +656,7 @@ public class SaneTsv
properties[j].SetValue(record, parsedDouble); properties[j].SetValue(record, parsedDouble);
} }
else if (columnTypes[j] == typeof(UInt32Type)) else if (columnTypes[j].GetType() == typeof(UInt32Type))
{ {
if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32)) if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
{ {
@ -668,7 +665,7 @@ public class SaneTsv
properties[j].SetValue(record, parsedUInt32); properties[j].SetValue(record, parsedUInt32);
} }
else if (columnTypes[j] == typeof(UInt64Type)) else if (columnTypes[j].GetType() == typeof(UInt64Type))
{ {
if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64)) if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64))
{ {
@ -677,7 +674,7 @@ public class SaneTsv
properties[j].SetValue(record, parsedUInt64); properties[j].SetValue(record, parsedUInt64);
} }
else if (columnTypes[j] == typeof(Int32Type)) else if (columnTypes[j].GetType() == typeof(Int32Type))
{ {
if (!Int32.TryParse(fieldString, out Int32 parsedInt32)) if (!Int32.TryParse(fieldString, out Int32 parsedInt32))
{ {
@ -686,7 +683,7 @@ public class SaneTsv
properties[j].SetValue(record, parsedInt32); properties[j].SetValue(record, parsedInt32);
} }
else if (columnTypes[j] == typeof(Int64Type)) else if (columnTypes[j].GetType() == typeof(Int64Type))
{ {
if (!Int64.TryParse(fieldString, out Int64 parsedInt64)) if (!Int64.TryParse(fieldString, out Int64 parsedInt64))
{ {
@ -846,7 +843,7 @@ public class SaneTsv
public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer) public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
{ {
string[] columnNames = null; string[] columnNames = null;
var headers = new List<byte[]>(); var headerFields = new List<byte[]>();
var fieldBytes = new List<byte>(); var fieldBytes = new List<byte>();
int startOfData = -1; int startOfData = -1;
for (int i = 0; i < inputBuffer.Count(); i++) for (int i = 0; i < inputBuffer.Count(); i++)
@ -885,33 +882,33 @@ public class SaneTsv
else if (inputBuffer[i] == '\t') else if (inputBuffer[i] == '\t')
{ {
// end of field // end of field
headers.Add(fieldBytes.ToArray()); headerFields.Add(fieldBytes.ToArray());
fieldBytes.Clear(); fieldBytes.Clear();
} }
else if (inputBuffer[i] == '\n') else if (inputBuffer[i] == '\n')
{ {
// This is the end of the header // This is the end of the header
headers.Add(fieldBytes.ToArray()); headerFields.Add(fieldBytes.ToArray());
startOfData = i + 1; startOfData = i + 1;
columnNames = new string[headers.Count]; columnNames = new string[headerFields.Count];
fieldBytes.Clear(); fieldBytes.Clear();
for (int j = 0; j < headers.Count; j++) for (int j = 0; j < headerFields.Count; j++)
{ {
string columnString; string columnString;
try try
{ {
columnString = Encoding.UTF8.GetString(headers[j]); columnString = Encoding.UTF8.GetString(headerFields[j]);
} }
catch (Exception e) catch (Exception e)
{ {
throw new Exception($"Column {headers.Count} name is not valid UTF-8", e); throw new Exception($"Column {headerFields.Count} name is not valid UTF-8", e);
} }
if (columnString.Contains(':')) if (columnString.Contains(':'))
{ {
throw new Exception($"Header {headers.Count} contain ':', which is not allowed for column names"); throw new Exception($"Header field {headerFields.Count} contain ':', which is not allowed for column names");
} }
columnNames[j] = columnString; columnNames[j] = columnString;
@ -939,7 +936,7 @@ public class SaneTsv
var fields = new List<byte[]>(); var fields = new List<byte[]>();
var records = new List<string[]>(); var records = new List<string[]>();
int line = 1; int line = 2;
int currentLineStart = 0; int currentLineStart = 0;
// Go back to the start of the current line // Go back to the start of the current line
@ -1033,13 +1030,20 @@ public class SaneTsv
fields.Add(fieldBytes.ToArray()); fields.Add(fieldBytes.ToArray());
if (fields.Count == 0) if (fields.Count == 0 && endIndex == inputBuffer.Length)
{ {
throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record"); throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record");
} }
if (numFields != fields.Count) if (numFields != fields.Count)
{ {
throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}"); if (endIndex == inputBuffer.Length)
{
throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
}
else
{
return records.ToArray();
}
} }
else else
{ {
@ -1062,99 +1066,151 @@ public class SaneTsv
return records.ToArray(); return records.ToArray();
} }
public static Type GetColumnFromType(Type type) public static ColumnType GetColumnFromString(string type)
{ {
if (type == typeof(string)) if (type == "string")
{ {
return typeof(StringType); return new StringType();
} }
else if (type == typeof(bool)) else if (type == "boolean")
{ {
return typeof(BooleanType); return new BooleanType();
} }
else if (type == typeof(float)) else if (type == "float32")
{ {
return typeof(Float32Type); return new Float32Type();
} }
else if (type == typeof(double)) else if (type == "float32-le")
{ {
return typeof(Float64Type); return new Float32LEType();
} }
else if (type == typeof(UInt32)) else if (type == "float64")
{ {
return typeof(UInt32Type); return new Float64Type();
} }
else if (type == typeof(UInt64)) else if (type == "float64-le")
{ {
return typeof(UInt64Type); return new Float64LEType();
} }
else if (type == typeof(Int32)) else if (type == "uint32")
{ {
return typeof(Int32Type); return new UInt32Type();
} }
else if (type == typeof(Int64)) else if (type == "uint64")
{ {
return typeof(Int64Type); return new UInt64Type();
} }
else if (type == typeof(byte[])) else if (type == "int32")
{ {
return typeof(BinaryType); return new Int32Type();
}
else if (type == "int64")
{
return new Int64Type();
}
else if (type == "binary")
{
return new BinaryType();
} }
else else
{ {
throw new Exception($"Invalid type: {type}"); throw new Exception($"Invalid type: {type.GetType()}");
} }
} }
public static string GetNameFromColumn(Type type) public static ColumnType GetColumnFromType(Type type)
{ {
if (type == typeof(StringType)) if (type == typeof(string))
{
return new StringType();
}
else if (type == typeof(bool))
{
return new BooleanType();
}
else if (type == typeof(float))
{
return new Float32Type();
}
else if (type == typeof(double))
{
return new Float64Type();
}
else if (type == typeof(UInt32))
{
return new UInt32Type();
}
else if (type == typeof(UInt64))
{
return new UInt64Type();
}
else if (type == typeof(Int32))
{
return new Int32Type();
}
else if (type == typeof(Int64))
{
return new Int64Type();
}
else if (type == typeof(byte[]))
{
return new BinaryType();
}
else
{
throw new Exception($"Invalid type: {type.GetType()}");
}
}
public static string GetNameFromColumn(ColumnType type)
{
if (type.GetType() == typeof(StringType))
{ {
return "string"; return "string";
} }
else if (type == typeof(BooleanType)) else if (type.GetType() == typeof(BooleanType))
{ {
return "boolean"; return "boolean";
} }
else if (type == typeof(Float32Type)) else if (type.GetType() == typeof(Float32Type))
{ {
return "float32"; return "float32";
} }
else if (type == typeof(Float32LEType)) else if (type.GetType() == typeof(Float32LEType))
{ {
return "float32-le"; return "float32-le";
} }
else if (type == typeof(Float64Type)) else if (type.GetType() == typeof(Float64Type))
{ {
return "float64"; return "float64";
} }
else if (type == typeof(Float64LEType)) else if (type.GetType() == typeof(Float64LEType))
{ {
return "float64-le"; return "float64-le";
} }
else if (type == typeof(UInt32Type)) else if (type.GetType() == typeof(UInt32Type))
{ {
return "uint32"; return "uint32";
} }
else if (type == typeof(UInt64Type)) else if (type.GetType() == typeof(UInt64Type))
{ {
return "uint64"; return "uint64";
} }
else if (type == typeof(Int32Type)) else if (type.GetType() == typeof(Int32Type))
{ {
return "int32"; return "int32";
} }
else if (type == typeof(Int64Type)) else if (type.GetType() == typeof(Int64Type))
{ {
return "int64"; return "int64";
} }
else if (type == typeof(BinaryType)) else if (type.GetType() == typeof(BinaryType))
{ {
return "binary"; return "binary";
} }
else else
{ {
throw new Exception($"Invalid type: {type}"); throw new Exception($"Invalid type: {type.GetType()}");
} }
} }
@ -1170,16 +1226,26 @@ public class SaneTsv
public static byte[] SerializeCommentedTsv<T>(IList<T> data, string fileComment) where T : CommentedTsvRecord public static byte[] SerializeCommentedTsv<T>(IList<T> data, string fileComment) where T : CommentedTsvRecord
{ {
return SerializeTsv<T>(data, FormatType.COMMENTED_TSV); return SerializeTsv<T>(data, FormatType.COMMENTED_TSV, fileComment);
} }
protected static byte[] SerializeTsv<T>(IList<T> data, FormatType tsvFormat) protected static byte[] SerializeTsv<T>(IList<T> data, FormatType tsvFormat, string fileComment = null)
{ {
var bytes = new List<byte>(); var bytes = new List<byte>();
var headerTypes = new List<Type>(); if (fileComment != null)
var headerNames = new List<string>(); {
var headerPropertyInfos = new List<PropertyInfo>(); if (tsvFormat != FormatType.COMMENTED_TSV)
{
throw new Exception($"File comments are not valid for {tsvFormat}");
}
bytes.AddRange(Encoding.UTF8.GetBytes("#" + fileComment.Replace("\n", "\n#") + "\n"));
}
var columnTypes = new List<ColumnType>();
var columnNames = new List<string>();
var columnPropertyInfos = new List<PropertyInfo>();
int columnCount = 0; int columnCount = 0;
// Serialize header // Serialize header
@ -1192,30 +1258,30 @@ public class SaneTsv
} }
string headerName = attribute.ColumnName ?? property.Name; string headerName = attribute.ColumnName ?? property.Name;
headerNames.Add(headerName); columnNames.Add(headerName);
Type headerType = attribute.ColumnType ?? GetColumnFromType(property.PropertyType); ColumnType headerType = attribute.ColumnType ?? GetColumnFromType(property.PropertyType);
if (tsvFormat == FormatType.SIMPLE_TSV && headerType != typeof(StringType)) if (tsvFormat == FormatType.SIMPLE_TSV && headerType.GetType() != typeof(StringType))
{ {
throw new Exception($"Serializing Simple TSV requires all columns be of type string, but column '{headerName}' has type '{headerType}'"); throw new Exception($"Serializing Simple TSV requires all columns be of type string, but column '{headerName}' has type '{headerType}'");
} }
headerTypes.Add(headerType); columnTypes.Add(headerType);
headerPropertyInfos.Add(property); columnPropertyInfos.Add(property);
// TODO: Check that the property type and given column type are compatible // TODO: Check that the property type and given column type are compatible
columnCount++; columnCount++;
} }
// Serialize header // Serialize header
for (int i = 0; i < headerNames.Count; i++) for (int i = 0; i < columnNames.Count; i++)
{ {
for (int j = i + 1; j < headerNames.Count; j++) for (int j = i + 1; j < columnNames.Count; j++)
{ {
if (headerNames[i] == headerNames[j]) if (columnNames[i] == columnNames[j])
{ {
throw new Exception("Column names in header must be unique"); throw new Exception("Column names in header must be unique");
} }
} }
byte[] nameEncoded = Encoding.UTF8.GetBytes(headerNames[i]); byte[] nameEncoded = Encoding.UTF8.GetBytes(columnNames[i]);
for (int j = 0; j < nameEncoded.Length; j++) for (int j = 0; j < nameEncoded.Length; j++)
{ {
@ -1250,15 +1316,15 @@ public class SaneTsv
bytes.Add((byte)':'); bytes.Add((byte)':');
try try
{ {
bytes.AddRange(Encoding.UTF8.GetBytes(GetNameFromColumn(headerTypes[i]))); bytes.AddRange(Encoding.UTF8.GetBytes(GetNameFromColumn(columnTypes[i])));
} }
catch (Exception e) catch (Exception e)
{ {
throw new Exception($"Invalid header type for column {i}", e); throw new Exception($"Invalid column type for column {i}", e);
} }
} }
if (i == headerNames.Count - 1) if (i == columnNames.Count - 1)
{ {
bytes.Add((byte)'\n'); bytes.Add((byte)'\n');
} }
@ -1269,19 +1335,19 @@ public class SaneTsv
} }
// Serialize data // Serialize data
SerializeTsv<T>(data, bytes, headerPropertyInfos.ToArray(), headerTypes.ToArray(), tsvFormat, 0, data.Count); SerializeTsv<T>(data, bytes, columnPropertyInfos.ToArray(), columnTypes.ToArray(), tsvFormat, 0, data.Count);
return bytes.ToArray(); return bytes.ToArray();
} }
protected static void SerializeTsv<T>(IList<T> data, List<byte> bytes, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, FormatType tsvFormat, int startIndex, int endIndex) protected static void SerializeTsv<T>(IList<T> data, List<byte> bytes, PropertyInfo[] columnPropertyInfos, ColumnType[] columnTypes, FormatType tsvFormat, int startIndex, int endIndex)
{ {
// Serialize data // Serialize data
for (int i = 0; i < data.Count; i++) for (int i = 0; i < data.Count; i++)
{ {
for (int j = 0; j < headerTypes.Length; j++) for (int j = 0; j < columnTypes.Length; j++)
{ {
object datum = headerPropertyInfos[j].GetValue(data[i]); object datum = columnPropertyInfos[j].GetValue(data[i]);
try try
{ {
@ -1289,16 +1355,16 @@ public class SaneTsv
// Some fields definitely don't need escaping, so we add them directly to bytes // Some fields definitely don't need escaping, so we add them directly to bytes
bool skipEscaping = false; bool skipEscaping = false;
if (headerTypes[j] == typeof(StringType)) if (columnTypes[j].GetType() == typeof(StringType))
{ {
fieldEncoded = Encoding.UTF8.GetBytes((string)datum); fieldEncoded = Encoding.UTF8.GetBytes((string)datum);
} }
else if (headerTypes[j] == typeof(BooleanType)) else if (columnTypes[j].GetType() == typeof(BooleanType))
{ {
bytes.AddRange((bool)datum ? TrueEncoded : FalseEncoded); bytes.AddRange((bool)datum ? TrueEncoded : FalseEncoded);
skipEscaping = true; skipEscaping = true;
} }
else if (headerTypes[j] == typeof(Float32Type)) else if (columnTypes[j].GetType() == typeof(Float32Type))
{ {
if (datum is float f) if (datum is float f)
{ {
@ -1322,7 +1388,7 @@ public class SaneTsv
} }
skipEscaping = true; skipEscaping = true;
} }
else if (headerTypes[j] == typeof(Float32LEType)) else if (columnTypes[j].GetType() == typeof(Float32LEType))
{ {
if (LittleEndian) if (LittleEndian)
{ {
@ -1338,7 +1404,7 @@ public class SaneTsv
} }
} }
} }
else if (headerTypes[j] == typeof(Float64Type)) else if (columnTypes[j].GetType() == typeof(Float64Type))
{ {
if (datum is double d) if (datum is double d)
{ {
@ -1362,7 +1428,7 @@ public class SaneTsv
} }
skipEscaping = true; skipEscaping = true;
} }
else if (headerTypes[j] == typeof(Float64LEType)) else if (columnTypes[j].GetType() == typeof(Float64LEType))
{ {
if (LittleEndian) if (LittleEndian)
{ {
@ -1378,33 +1444,33 @@ public class SaneTsv
} }
} }
} }
else if (headerTypes[j] == typeof(UInt32Type)) else if (columnTypes[j].GetType() == typeof(UInt32Type))
{ {
bytes.AddRange(Encoding.UTF8.GetBytes(((UInt32)datum).ToString())); bytes.AddRange(Encoding.UTF8.GetBytes(((UInt32)datum).ToString()));
skipEscaping = true; skipEscaping = true;
} }
else if (headerTypes[j] == typeof(UInt64Type)) else if (columnTypes[j].GetType() == typeof(UInt64Type))
{ {
bytes.AddRange(Encoding.UTF8.GetBytes(((UInt64)datum).ToString())); bytes.AddRange(Encoding.UTF8.GetBytes(((UInt64)datum).ToString()));
skipEscaping = true; skipEscaping = true;
} }
else if (headerTypes[j] == typeof(Int32Type)) else if (columnTypes[j].GetType() == typeof(Int32Type))
{ {
bytes.AddRange(Encoding.UTF8.GetBytes(((Int32)datum).ToString())); bytes.AddRange(Encoding.UTF8.GetBytes(((Int32)datum).ToString()));
skipEscaping = true; skipEscaping = true;
} }
else if (headerTypes[j] == typeof(Int64Type)) else if (columnTypes[j].GetType() == typeof(Int64Type))
{ {
bytes.AddRange(Encoding.UTF8.GetBytes(((Int64)datum).ToString())); bytes.AddRange(Encoding.UTF8.GetBytes(((Int64)datum).ToString()));
skipEscaping = true; skipEscaping = true;
} }
else if (headerTypes[j] == typeof(BinaryType)) else if (columnTypes[j].GetType() == typeof(BinaryType))
{ {
fieldEncoded = (byte[])datum; fieldEncoded = (byte[])datum;
} }
else else
{ {
throw new Exception($"Unexpected column type {headerTypes[j]} for column {j}"); throw new Exception($"Unexpected column type {columnTypes[j]} for column {j}");
} }
if (!skipEscaping) if (!skipEscaping)
@ -1438,7 +1504,7 @@ public class SaneTsv
} }
} }
if (j < headerTypes.Length - 1) if (j < columnTypes.Length - 1)
{ {
bytes.Add((byte)'\t'); bytes.Add((byte)'\t');
} }
@ -1449,7 +1515,7 @@ public class SaneTsv
} }
catch (InvalidCastException e) catch (InvalidCastException e)
{ {
throw new Exception($"Record {i}, field {j} expected type compatible with {GetNameFromColumn(headerTypes[j])}", e); throw new Exception($"Record {i}, field {j} expected type compatible with {GetNameFromColumn(columnTypes[j])}", e);
} }
} }
} }
@ -1508,15 +1574,15 @@ public class SaneTsv
public class TsvColumnAttribute : Attribute public class TsvColumnAttribute : Attribute
{ {
public string ColumnName { get; } public string ColumnName { get; }
public virtual Type ColumnType { get; } public virtual ColumnType ColumnType { get; }
public TsvColumnAttribute() public TsvColumnAttribute()
{ {
ColumnType = typeof(StringType); ColumnType = new StringType();
} }
public TsvColumnAttribute(string columnName) public TsvColumnAttribute(string columnName)
{ {
ColumnType = typeof(StringType); ColumnType = new StringType();
ColumnName = columnName; ColumnName = columnName;
} }
} }
@ -1524,27 +1590,19 @@ public class SaneTsv
// TODO: Add column ordering // TODO: Add column ordering
/// <summary>
/// Marks a property as a typed TSV column. The column type is held as a
/// <see cref="ColumnType"/> instance rather than a <see cref="Type"/>.
/// </summary>
public class TypedTsvColumnAttribute : TsvColumnAttribute
{
    /// <summary>
    /// Column type for this column. Stays at its default (null) when the attribute
    /// is constructed without a column type.
    /// </summary>
    public override ColumnType ColumnType { get; }

    /// <summary>Creates an attribute with neither a column name nor a column type.</summary>
    public TypedTsvColumnAttribute()
    {
    }

    /// <summary>Creates an attribute with a column name and no explicit column type.</summary>
    /// <param name="columnName">Name of the column this property maps to.</param>
    public TypedTsvColumnAttribute(string columnName)
        : base(columnName)
    {
    }

    /// <summary>Creates an attribute with a column name and a column type given by name.</summary>
    /// <param name="columnName">Name of the column this property maps to.</param>
    /// <param name="columnType">
    /// Column type as a string, resolved through GetColumnFromString. A string is used
    /// because a ColumnType instance cannot be written as an attribute argument (CS0181).
    /// </param>
    public TypedTsvColumnAttribute(string columnName, string columnType)
        : base(columnName)
        => ColumnType = GetColumnFromString(columnType);

    /// <summary>Creates an attribute from an already-constructed column type instance.</summary>
    /// <param name="columnType">Column type instance to store as-is.</param>
    public TypedTsvColumnAttribute(ColumnType columnType)
        => ColumnType = columnType;
}

View File

@ -1,4 +1,5 @@
using NathanMcRae; using NathanMcRae;
using System.Reflection;
using System.Text; using System.Text;
internal class Program : SaneTsv internal class Program : SaneTsv
@ -80,6 +81,18 @@ internal class Program : SaneTsv
public string Column3 { get; set; } public string Column3 { get; set; }
} }
// Test record whose three columns all use the plain TsvColumn attribute.
// Used by the simple-TSV test cases in this file.
public class BoolTestRecord3 : SaneTsv.CommentedTsvRecord
{
[SaneTsv.TsvColumn("column1")]
public string Column1 { get; set; }
// No column name is passed to the attribute here.
// NOTE(review): presumably the column name then falls back to the property name,
// which would explain the lowercase `column2` - confirm against the parser before renaming.
[SaneTsv.TsvColumn]
public string column2 { get; set; }
// The column name contains a literal newline character.
[SaneTsv.TsvColumn("columnthree\nyep")]
public string Column3 { get; set; }
}
public class SerdeTestRecord : SaneTsv.CommentedTsvRecord public class SerdeTestRecord : SaneTsv.CommentedTsvRecord
{ {
[SaneTsv.TypedTsvColumn("column1")] [SaneTsv.TypedTsvColumn("column1")]
@ -348,6 +361,428 @@ internal class Program : SaneTsv
Console.WriteLine($"Unspecced parse time: {unspeccedParseTime}"); Console.WriteLine($"Unspecced parse time: {unspeccedParseTime}");
} }
{
string testName = "With and without file comment";
string testString1 = "#This is a file comment\n" +
"#One more file comment line\n" +
"column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\n#This is a comment" +
"\n#Another comment line" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
string testString2 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\n#This is a comment" +
"\n#Another comment line" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
CommentedTsv<BoolTestRecord2> parsed = SaneTsv.ParseCommentedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
CommentedTsv<BoolTestRecord2> parsed2 = SaneTsv.ParseCommentedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString2));
if (parsed.FileComment == "This is a file comment\nOne more file comment line" && parsed2.FileComment == null)
{
Console.WriteLine($"Passed {testName}");
}
else
{
Console.WriteLine($"Failed {testName}");
}
}
{
    // "With and without types".
    // testString1 has type annotations in its header: the typed parser is expected to
    // accept it (1A) and both simple parsers are expected to reject it (1B, 1C).
    // testString2 has no type annotations: the typed parser is expected to reject it (2A)
    // and both simple parsers are expected to accept it (2B, 2C).
    string testName = "With and without types";

    string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
        "\nTRUE\tvalue\\\\twoo\tvaluetrhee" +
        "\nFALSE\tnother\tno\\ther";

    try
    {
        Tsv<BoolTestRecord2> parsed = SaneTsv.ParseTypedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
        Console.WriteLine($"Passed {testName} 1A");
    }
    catch (Exception)
    {
        Console.WriteLine($"Failed {testName} 1A");
    }

    try
    {
        Tsv<BoolTestRecord2> parsed2 = SaneTsv.ParseSimpleTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
        Console.WriteLine($"Failed {testName} 1B");
    }
    catch (Exception)
    {
        Console.WriteLine($"Passed {testName} 1B");
    }

    try
    {
        (string[] columns, string[][] data) = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString1));
        Console.WriteLine($"Failed {testName} 1C");
    }
    catch (Exception)
    {
        Console.WriteLine($"Passed {testName} 1C");
    }

    string testString2 = "column1\tcolumn2\tcolumnthree\\nyep" +
        "\nTRUE\tvalue\\\\twoo\tvaluetrhee" +
        "\nFALSE\tnother\tno\\ther";

    try
    {
        Tsv<BoolTestRecord2> parsed = SaneTsv.ParseTypedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString2));
        Console.WriteLine($"Failed {testName} 2A");
    }
    catch (Exception)
    {
        Console.WriteLine($"Passed {testName} 2A");
    }

    // 2B and 2C previously re-parsed testString1 and expected failure, which made them
    // duplicates of 1B/1C and left testString2 untested against the simple parsers.
    // BoolTestRecord3 is used for 2B because it is the record type the other simple-TSV
    // cases in this file pair with this untyped header.
    try
    {
        Tsv<BoolTestRecord3> parsed2 = SaneTsv.ParseSimpleTsv<BoolTestRecord3>(Encoding.UTF8.GetBytes(testString2));
        Console.WriteLine($"Passed {testName} 2B");
    }
    catch (Exception)
    {
        Console.WriteLine($"Failed {testName} 2B");
    }

    try
    {
        (string[] columns, string[][] data) = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString2));
        Console.WriteLine($"Passed {testName} 2C");
    }
    catch (Exception)
    {
        Console.WriteLine($"Failed {testName} 2C");
    }
}
{
string testName = "With and without line comment";
string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\n#This is a comment" +
"\n#Another comment line" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
try
{
CommentedTsv<BoolTestRecord2> parsed = SaneTsv.ParseCommentedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Passed {testName} 1A");
}
catch (Exception e)
{
Console.WriteLine($"Failed {testName} 1A");
}
try
{
Tsv<BoolTestRecord2> parsed = SaneTsv.ParseTypedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1B");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1B");
}
try
{
Tsv<BoolTestRecord2> parsed2 = SaneTsv.ParseSimpleTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1C");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1C");
}
try
{
(string[] columns, string[][] data) = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1D");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1D");
}
}
{
string testName = "End of file comment";
string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther" +
"\n# Hey, you're not supposed to have comments at the end of the tsv!";
try
{
CommentedTsv<BoolTestRecord2> parsed = SaneTsv.ParseCommentedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1A");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1A");
}
try
{
Tsv<BoolTestRecord2> parsed = SaneTsv.ParseTypedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1B");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1B");
}
string testString2 = "column1\tcolumn2\tcolumnthree\\nyep" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther" +
"\n# Hey, you're not supposed to have comments at the end of the tsv!";
try
{
Tsv<BoolTestRecord3> parsed3 = SaneTsv.ParseSimpleTsv<BoolTestRecord3>(Encoding.UTF8.GetBytes(testString2));
Console.WriteLine($"Failed {testName} 1C");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1C");
}
try
{
(string[] columns, string[][] data) = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString2));
Console.WriteLine($"Failed {testName} 1D");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1D");
}
}
{
// "Partial parsing": exercises the SaneTsv.Parse<T> and SaneTsv.ParseSimpleTsv overloads
// that take start/end indices into the input buffer instead of parsing the whole buffer.
string testName = "Partial parsing";
string line1 = "column1\tcolumn2\tcolumnthree\\nyep";
string line2 = "\nTRUE\tvalue\\\\t\0woo\tvaluetrhee";
string line3 = "\nFALSE\tnother\tno\\ther";
// Every character above encodes to a single UTF-8 byte, so the string lengths used
// below are also byte offsets into inputBuffer.
byte[] inputBuffer = Encoding.UTF8.GetBytes(line1 + line2 + line3);
// Collect the column metadata by reflecting over BoolTestRecord3's attributed properties.
// NOTE(review): headerTypes is a List<Type>, but TsvColumnAttribute.ColumnType appears to be
// declared as ColumnType after the column-type change - confirm this still compiles and
// matches the parameter type Parse<T> expects.
var headerTypes = new List<Type>();
// NOTE(review): headerNames and columnCount are filled in below but never read in this block.
var headerNames = new List<string>();
var headerPropertyInfos = new List<PropertyInfo>();
int columnCount = 0;
foreach (PropertyInfo property in typeof(BoolTestRecord3).GetProperties())
{
TsvColumnAttribute attribute = (TsvColumnAttribute)Attribute.GetCustomAttribute(property, typeof(TsvColumnAttribute));
// Properties without a TsvColumn attribute are not columns.
if (attribute == null)
{
continue;
}
// Fall back to the property name / property type when the attribute does not supply one.
headerNames.Add(attribute.ColumnName ?? property.Name);
headerTypes.Add(attribute.ColumnType ?? GetColumnFromType(property.PropertyType));
headerPropertyInfos.Add(property);
// TODO: Check that the property type and given column type are compatible
columnCount++;
}
// Check 1: the start index is the first byte after line3's leading '\n' (inside the last
// record) and the end index is the end of the buffer; the check expects no records.
BoolTestRecord3[] records = SaneTsv.Parse<BoolTestRecord3>(inputBuffer,
FormatType.SIMPLE_TSV,
headerPropertyInfos.ToArray(),
headerTypes.ToArray(),
line1.Length + line2.Length + 1,
inputBuffer.Length);
if (records.Length == 0 )
{
Console.WriteLine($"Passed {testName} 1");
}
else
{
Console.WriteLine($"Failed {testName} 1");
}
// Check 2: the start index is line2's leading '\n' and the end index falls inside the
// "TRUE" field of line2; the check expects the whole line2 record to be returned.
// NOTE(review): records2[0] throws instead of printing "Failed" if no record comes back.
BoolTestRecord3[] records2 = SaneTsv.Parse<BoolTestRecord3>(inputBuffer,
FormatType.SIMPLE_TSV,
headerPropertyInfos.ToArray(),
headerTypes.ToArray(),
line1.Length,
line1.Length + 3);
if (records2[0].Column3 == "valuetrhee")
{
Console.WriteLine($"Passed {testName} 2");
}
else
{
Console.WriteLine($"Failed {testName} 2");
}
// Check 3: same index range as check 1, through the untyped overload (second argument 3;
// presumably the column count - confirm).
// NOTE(review): check 1 expects zero records for this range while this check expects the
// line3 record; checks 2 and 4 disagree in the same way. Confirm whether the two
// overloads really treat range boundaries differently or whether expectations are swapped.
string[][] data = SaneTsv.ParseSimpleTsv(inputBuffer, 3, line1.Length + line2.Length + 1, inputBuffer.Length);
if (data[0][1] == "nother")
{
Console.WriteLine($"Passed {testName} 3");
}
else
{
Console.WriteLine($"Failed {testName} 3");
}
// Check 4: same index range as check 2, through the untyped overload; expects no records.
string[][] data2 = SaneTsv.ParseSimpleTsv(inputBuffer, 3, line1.Length, line1.Length + 3);
if (data2.Length == 0)
{
Console.WriteLine($"Passed {testName} 4");
}
else
{
Console.WriteLine($"Failed {testName} 4");
}
}
{
string testName = "End of file \\n";
string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther" +
"\n";
try
{
CommentedTsv<BoolTestRecord2> parsed = SaneTsv.ParseCommentedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1A");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1A");
}
try
{
Tsv<BoolTestRecord2> parsed = SaneTsv.ParseTypedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1B");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1B");
}
string testString2 = "column1\tcolumn2\tcolumnthree\\nyep" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther" +
"\n";
try
{
Tsv<BoolTestRecord3> parsed3 = SaneTsv.ParseSimpleTsv<BoolTestRecord3>(Encoding.UTF8.GetBytes(testString2));
Console.WriteLine($"Failed {testName} 1C");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1C");
}
try
{
(string[] columns, string[][] data) = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString2));
Console.WriteLine($"Failed {testName} 1D");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1D");
}
}
{
string testName = "End of file partial record";
string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther" +
"\nTRUE\t";
try
{
CommentedTsv<BoolTestRecord2> parsed = SaneTsv.ParseCommentedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1A");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1A");
}
try
{
Tsv<BoolTestRecord2> parsed = SaneTsv.ParseTypedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
Console.WriteLine($"Failed {testName} 1B");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1B");
}
string testString2 = "column1\tcolumn2\tcolumnthree\\nyep" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther" +
"\nTRUE\t";
try
{
Tsv<BoolTestRecord3> parsed3 = SaneTsv.ParseSimpleTsv<BoolTestRecord3>(Encoding.UTF8.GetBytes(testString2));
Console.WriteLine($"Failed {testName} 1C");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1C");
}
try
{
(string[] columns, string[][] data) = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString2));
Console.WriteLine($"Failed {testName} 1D");
}
catch (Exception e)
{
Console.WriteLine($"Passed {testName} 1D");
}
}
{
string testName = "File comment serde";
string testString1 = "#this is a file comment" +
"\n# and one more line since you're such a good customer" +
"\ncolumn1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
CommentedTsv<BoolTestRecord2> parsed = SaneTsv.ParseCommentedTsv<BoolTestRecord2>(Encoding.UTF8.GetBytes(testString1));
string reserialized = Encoding.UTF8.GetString(SaneTsv.SerializeCommentedTsv<BoolTestRecord2>(parsed.Records, parsed.FileComment));
if (reserialized == testString1)
{
Console.WriteLine($"Passed {testName}");
}
else
{
Console.WriteLine($"Failed {testName}");
}
}
Console.WriteLine("Done with tests"); Console.WriteLine("Done with tests");
} }
} }

View File

@ -3,14 +3,43 @@
## Roadmap ## Roadmap
- Improve error reporting by including line/column information in exceptions - Improve error reporting by including line/column information in exceptions
- Come up with a static-typing interface - Use this to get line numbers for parallel parsing implementations
- [x] Come up with a static-typing interface
Something that doesn't require an array of objects Something that doesn't require an array of objects
Use a class with SaneTsv attributes
- Check numeric formatting matches spec - Check numeric formatting matches spec
- Do parallel parsing / serializing implementation - [x] Maybe add a binary representation for f32/f64. It should specify that it is Little-endian (since we have to pick one). That way we can guarantee bit-compatibility between implementations where an application might require that.
- Next task: Refactor parsing so that it will start and end at arbitrary indices and return an array of SaneTsvRecords. The refactor should ignore the current record (unless at the start of the buffer) and continue parsing the record the end index is in. - [x] Add Column name/type specification to API
- More optimization and making parsing modular: - So you can tell it what columns to expect
- [ ] Lax/strict versions
See the attributes thing above
- Generate test cases
- [x] File comment / no file comment
- [x] header types / no header types
- [x] Line comments / no line comments
- [x] end of file comment
- [x] Test with the start index of parallel methods in last record
- end index in first record
- [x] Extra \n at end of file
- [x] Wrong number of fields
- Wrong number of fields at end of file
- [x] Do parallel parsing / serializing implementation
- [x] Next task: Refactor parsing so that it will start and end at arbitrary indices and return an array of SaneTsvRecords. The refactor should ignore the current record (unless at the start of the buffer) and continue parsing the record the end index is in.
- ~~More optimization and making parsing modular:~~
- Have callbacks for header parsing and field parsing - Have callbacks for header parsing and field parsing
- That way other formats (like ExtraTSV) don't have to iterate through the entire set of data again. - That way other formats (like ExtraTSV) don't have to iterate through the entire set of data again.
- Finish ExtraTSV implementation - [x] Make untyped Simple TSV (De)serialization
- Do zig implementation - [x] ~~Finish~~ Minimal ExtraTSV implementation
- [ ] Do zig implementation
- Make a c interface from that - Make a c interface from that
- Make a commandline interface
- Make a viewer / editor
- Streaming interface
So you can start processing your data while it finishes parsing?
- [ ] Decoding a binary stream with a \0 in it via UTF-8 doesn't seem to cause any issues. I thought that valid UTF-8 wouldn't have a \0?
- [ ] Instead of exceptions when parsing, we should parse as much as possible and reflect parsing errors in the returned data structure