diff --git a/SaneTsv/SaneTsv.cs b/SaneTsv/SaneTsv.cs index 76d8cf2..7966bcc 100644 --- a/SaneTsv/SaneTsv.cs +++ b/SaneTsv/SaneTsv.cs @@ -20,13 +20,36 @@ public class SaneTsv BINARY, } + protected enum FormatType + { + SANE_TSV = 0, + TYPED_TSV = 1, + COMMENTED_TSV = 2, + } + // TODO: We need to be able to update all these in tandem somehow public string[] ColumnNames { get; protected set; } public ColumnType[] ColumnTypes { get; protected set; } public Dictionary> Columns { get; protected set; } public List Records { get; protected set; } - public static SaneTsv Parse(byte[] inputBuffer) + public static SaneTsv ParseSaneTsv(byte[] inputBuffer) + { + return Parse(inputBuffer, FormatType.SANE_TSV); + } + + public static SaneTsv ParseTypedTsv(byte[] inputBuffer) + { + return Parse(inputBuffer, FormatType.TYPED_TSV); + } + + public static SaneTsv ParseCommentedTsv(byte[] inputBuffer) + { + return Parse(inputBuffer, FormatType.COMMENTED_TSV); + } + + // TODO: Have parsing errors include line / column # + protected static SaneTsv Parse(byte[] inputBuffer, FormatType format) { var parsed = new SaneTsv(); parsed.Columns = new Dictionary>(); @@ -37,6 +60,8 @@ public class SaneTsv var fieldBytes = new List(); var fields = new List(); int numFields = -1; + int line = 1; + int currentLineStart = 0; for (int i = 0; i < inputBuffer.Count(); i++) { if (inputBuffer[i] == '\\') @@ -60,6 +85,11 @@ public class SaneTsv fieldBytes.Add((byte)'\t'); i++; } + else if (inputBuffer[i + 1] == '#') + { + fieldBytes.Add((byte)'#'); + i++; + } else { throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}"); @@ -101,12 +131,20 @@ public class SaneTsv string columnTypeString; string columnName; - if (columnString.Contains(":")) { + if (columnString.Contains(':')) { + if (format == FormatType.SANE_TSV) + { + throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names"); + } columnTypeString = columnString.Split(":").Last(); columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1); } else { + if (format > FormatType.SANE_TSV) + { + throw new Exception($"Header {fields.Count} has no type"); + } columnTypeString = ""; columnName = columnString; } @@ -163,21 +201,23 @@ public class SaneTsv parsed.ColumnTypes[j] = type; } - if (numTypesBlank != 0 && numTypesBlank != fields.Count) - { - throw new Exception("Types must be provided for all columns or none. Use 'string' for columns missing types."); - } - fields.Clear(); } else if (numFields != fields.Count) { - throw new Exception($"Expected {numFields} fields on line {parsed.Records.Count + 2}, but found {fields.Count}"); + throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}"); } else { - AddRecord(parsed, fields); + AddRecord(parsed, fields, line); } + + line++; + currentLineStart = i + 1; + } + else if (inputBuffer[i] == '#') + { + throw new Exception($"Found unescaped '#' at column {i - currentLineStart}, line {line}"); } else { @@ -197,13 +237,13 @@ public class SaneTsv } else { - AddRecord(parsed, fields); + AddRecord(parsed, fields, line); } return parsed; } - protected static void AddRecord(SaneTsv parsed, List fields) + protected static void AddRecord(SaneTsv parsed, List fields, int line) { var parsedFields = new object[fields.Count]; for (int j = 0; j < fields.Count; j++) @@ -223,7 +263,7 @@ public class SaneTsv } catch (Exception e) { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UTF-8", e); + throw new Exception($"Field {j} on line {line} is not valid UTF-8", e); } switch (parsed.ColumnTypes[j]) @@ -244,7 +284,7 @@ public class SaneTsv } else { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly"); + throw new Exception($"Field {j} on line {line} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly"); } parsedFields[j] = parsedBool; @@ -253,7 +293,7 @@ public class SaneTsv case ColumnType.FLOAT32: if (!float.TryParse(fieldString, out float parsedFloat)) { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid single-precision float"); + throw new Exception($"Field {j} on line {line} is not valid single-precision float"); } parsedFields[j] = parsedFloat; @@ -262,7 +302,7 @@ public class SaneTsv case ColumnType.FLOAT64: if (!double.TryParse(fieldString, out double parsedDouble)) { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid double-precision float"); + throw new Exception($"Field {j} on line {line} is not valid double-precision float"); } parsedFields[j] = parsedDouble; @@ -271,7 +311,7 @@ public class SaneTsv case ColumnType.UINT32: if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32)) { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UInt32"); + throw new Exception($"Field {j} on line {line} is not valid UInt32"); } parsedFields[j] = parsedUInt32; @@ -280,7 +320,7 @@ public class SaneTsv case ColumnType.UINT64: if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64)) { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid UInt64"); + throw new Exception($"Field {j} on line {line} is not valid UInt64"); } parsedFields[j] = parsedUInt64; @@ -289,7 +329,7 @@ public class SaneTsv case ColumnType.INT32: if (!Int32.TryParse(fieldString, out Int32 parsedInt32)) { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid Int32"); + throw new Exception($"Field {j} on line {line} is not valid Int32"); } parsedFields[j] = parsedInt32; @@ -298,7 +338,7 @@ public class SaneTsv case ColumnType.INT64: if (!Int64.TryParse(fieldString, out Int64 parsedInt64)) { - throw new Exception($"Field {j} on line {parsed.Records.Count + 2} is not valid Int64"); + throw new Exception($"Field {j} on line {line} is not valid Int64"); } parsedFields[j] = parsedInt64; diff --git a/SaneTsv/SaneTsvTest/Program.cs b/SaneTsv/SaneTsvTest/Program.cs index ee50363..c9b8b25 100644 --- a/SaneTsv/SaneTsvTest/Program.cs +++ b/SaneTsv/SaneTsvTest/Program.cs @@ -3,12 +3,12 @@ using System.Text; { string testName = "Bool test"; - string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" + + string testString1 = "column1:ty\\#pe:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" + "\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" + "\nFALSE\tnother\tno\\ther"; - SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1)); - if (parsed.Records[0]["column1:type"] is bool result && result) + SaneTsv parsed = SaneTsv.ParseTypedTsv(Encoding.UTF8.GetBytes(testString1)); + if (parsed.Records[0]["column1:ty#pe"] is bool result && result) { Console.WriteLine($"Passed {testName}"); } @@ -26,7 +26,7 @@ using System.Text; "\nTUE\tvalue\\\\t\0woo\tvaluetrhee" + "\nFALSE\tnother\tno\\ther"; - SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1)); + SaneTsv parsed = SaneTsv.ParseTypedTsv(Encoding.UTF8.GetBytes(testString1)); Console.WriteLine($"Failed {testName}"); } catch (Exception)