diff --git a/SaneTsv.cs b/SaneTsv.cs
index af4db5a..93e7c45 100644
--- a/SaneTsv.cs
+++ b/SaneTsv.cs
@@ -7,20 +7,44 @@ namespace NathanMcRae;
 /// </summary>
 public class SaneTsv
 {
+    /// <summary>
+    /// Types a column may declare after the last ':' of its header field.
+    /// </summary>
+    public enum ColumnType
+    {
+        STRING,
+        BOOLEAN,
+        FLOAT32,
+        FLOAT64,
+        UINT32,
+        UINT64,
+        INT32,
+        INT64,
+        BINARY,
+    }
+
+    // Encoding.UTF8 replaces malformed bytes with U+FFFD instead of throwing, so the
+    // "is not valid UTF-8" checks need a decoder built with throwOnInvalidBytes: true.
+    private static readonly UTF8Encoding StrictUtf8 = new UTF8Encoding(false, true);
+
     // TODO: We need to be able to update all these in tandem somehow
     public string[] ColumnNames { get; protected set; }
-    public Dictionary<string, List<string>> Columns { get; protected set; }
+    public ColumnType[] ColumnTypes { get; protected set; }
+    public Dictionary<string, List<object>> Columns { get; protected set; }
     public List<SaneTsvRecord> Records { get; protected set; }
 
+    // TODO: Parse with specified columns / types
+
     public static SaneTsv Parse(byte[] inputBuffer)
     {
         var parsed = new SaneTsv();
-        parsed.Columns = new Dictionary<string, List<string>>();
+        parsed.Columns = new Dictionary<string, List<object>>();
         parsed.ColumnNames = new string[] { };
+        parsed.ColumnTypes = new ColumnType[] { };
         parsed.Records = new List<SaneTsvRecord>();
 
         var fieldBytes = new List<byte>();
-        var fields = new List<string>();
+        var fields = new List<byte[]>();
         int numFields = -1;
         for (int i = 0; i < inputBuffer.Count(); i++)
         {
@@ -53,26 +77,12 @@ public class SaneTsv
             else if (inputBuffer[i] == '\t')
             {
                 // end of field
-                try
-                {
-                    fields.Add(Encoding.UTF8.GetString(fieldBytes.ToArray()));
-                }
-                catch (Exception e)
-                {
-                    throw new Exception($"Field {fields.Count} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
-                }
+                fields.Add(fieldBytes.ToArray());
                 fieldBytes.Clear();
             }
             else if (inputBuffer[i] == '\n')
             {
-                try
-                {
-                    fields.Add(Encoding.UTF8.GetString(fieldBytes.ToArray()));
-                }
-                catch (Exception e)
-                {
-                    throw new Exception($"Field {fields.Count} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
-                }
+                fields.Add(fieldBytes.ToArray());
                 fieldBytes.Clear();
 
                 if (numFields < 0)
@@ -82,14 +92,78 @@ public class SaneTsv
                     numFields = fields.Count;
 
                     parsed.ColumnNames = new string[numFields];
+                    parsed.ColumnTypes = new ColumnType[numFields];
+
+                    int numTypesBlank = 0;
 
                     for (int j = 0; j < fields.Count; j++)
                     {
-                        string columnName = fields[j];
+                        string columnString;
+                        try
+                        {
+                            columnString = StrictUtf8.GetString(fields[j]);
+                        }
+                        catch (DecoderFallbackException e)
+                        {
+                            throw new Exception($"Header {j} is not valid UTF-8", e);
+                        }
+
+                        // The type is whatever follows the last ':', so a column name may itself contain ':'.
+                        string columnTypeString;
+                        string columnName;
+                        if (columnString.Contains(":"))
+                        {
+                            columnTypeString = columnString.Split(":").Last();
+                            columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1);
+                        }
+                        else
+                        {
+                            columnTypeString = "";
+                            columnName = columnString;
+                        }
+
+                        ColumnType type;
+
+                        switch (columnTypeString)
+                        {
+                            case "":
+                                numTypesBlank++;
+                                type = ColumnType.STRING;
+                                break;
+                            case "string":
+                                type = ColumnType.STRING;
+                                break;
+                            case "boolean":
+                                type = ColumnType.BOOLEAN;
+                                break;
+                            case "float32":
+                                type = ColumnType.FLOAT32;
+                                break;
+                            case "float64":
+                                type = ColumnType.FLOAT64;
+                                break;
+                            case "uint32":
+                                type = ColumnType.UINT32;
+                                break;
+                            case "uint64":
+                                type = ColumnType.UINT64;
+                                break;
+                            case "int32":
+                                type = ColumnType.INT32;
+                                break;
+                            case "int64":
+                                type = ColumnType.INT64;
+                                break;
+                            case "binary":
+                                type = ColumnType.BINARY;
+                                break;
+                            default:
+                                throw new Exception($"Invalid type '{columnTypeString}' for column {j}");
+                        }
 
                         try
                         {
-                            parsed.Columns.Add(columnName, new List<string>());
+                            parsed.Columns.Add(columnName, new List<object>());
                         }
                         catch (Exception e)
                         {
@@ -97,6 +171,12 @@ public class SaneTsv
                         }
 
                         parsed.ColumnNames[j] = columnName;
+                        parsed.ColumnTypes[j] = type;
+                    }
+
+                    if (numTypesBlank != 0 && numTypesBlank != fields.Count)
+                    {
+                        throw new Exception("Types must be provided for all columns or none. Use 'string' for columns missing types.");
                     }
 
                     fields.Clear();
@@ -107,13 +187,7 @@ public class SaneTsv
                 }
                 else
                 {
-                    for (int j = 0; j < fields.Count; j++)
-                    {
-                        parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
-                    }
-
-                    parsed.Records.Add(new SaneTsvRecord(parsed, fields.ToArray()));
-                    fields.Clear();
+                    AddRecord(parsed, fields);
                 }
             }
             else
@@ -122,50 +196,156 @@ public class SaneTsv
             }
         }
 
-        try
-        {
-            fields.Add(Encoding.UTF8.GetString(fieldBytes.ToArray()));
-        }
-        catch (Exception e)
-        {
-            throw new Exception($"Field {fields.Count} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
-        }
+        fields.Add(fieldBytes.ToArray());
 
+        // NOTE(review): numFields appears to be -1 or >= 1 by this point, so this branch looks unreachable - confirm the intended trailing-newline check.
+        if (numFields == 0)
+        {
+            throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record");
+        }
         if (numFields != fields.Count)
         {
             throw new Exception($"Expected {numFields} fields on line {parsed.Records.Count + 2}, but found {fields.Count}");
         }
         else
         {
-            for (int j = 0; j < fields.Count; j++)
-            {
-                try
-                {
-                    parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
-                }
-                catch (Exception e)
-                {
-                    throw new Exception($"Field {j} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
-                }
-            }
-
-            parsed.Records.Add(new SaneTsvRecord(parsed, fields.ToArray()));
-            fields.Clear();
+            AddRecord(parsed, fields);
         }
 
         return parsed;
     }
 
+    /// <summary>
+    /// Converts one record's raw fields to the types declared in the header, appends each value to
+    /// its column and the record to <see cref="Records"/>, then clears <paramref name="fields"/>.
+    /// </summary>
+    /// <param name="parsed">Table being built; its ColumnNames and ColumnTypes must already be set.</param>
+    /// <param name="fields">Raw bytes of each field, in column order.</param>
+    /// <exception cref="Exception">A field is not valid UTF-8 or cannot be parsed as its column's type.</exception>
+    protected static void AddRecord(SaneTsv parsed, List<byte[]> fields)
+    {
+        // TSV is a machine format: parse numbers the same way regardless of the host's culture.
+        var invariant = System.Globalization.CultureInfo.InvariantCulture;
+        // These are the styles the culture-less TryParse overloads use by default.
+        var floatStyle = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
+        var integerStyle = System.Globalization.NumberStyles.Integer;
+
+        // Same numbering Parse uses in its own messages: the header is line 1.
+        int line = parsed.Records.Count + 2;
+
+        var parsedFields = new object[fields.Count];
+        for (int j = 0; j < fields.Count; j++)
+        {
+            // All other types require the content to be UTF-8. Binary fields can ignore that.
+            if (parsed.ColumnTypes[j] == ColumnType.BINARY)
+            {
+                parsedFields[j] = fields[j];
+                parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
+                continue;
+            }
+
+            string fieldString;
+            try
+            {
+                fieldString = StrictUtf8.GetString(fields[j]);
+            }
+            catch (DecoderFallbackException e)
+            {
+                throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
+            }
+
+            object value;
+            switch (parsed.ColumnTypes[j])
+            {
+                case ColumnType.STRING:
+                    value = fieldString;
+                    break;
+                case ColumnType.BOOLEAN:
+                    if (fieldString == "TRUE")
+                    {
+                        value = true;
+                    }
+                    else if (fieldString == "FALSE")
+                    {
+                        value = false;
+                    }
+                    else
+                    {
+                        throw new Exception($"Field {j} on line {line} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");
+                    }
+
+                    break;
+                case ColumnType.FLOAT32:
+                    if (!float.TryParse(fieldString, floatStyle, invariant, out float parsedFloat))
+                    {
+                        throw new Exception($"Field {j} on line {line} is not valid single-precision float");
+                    }
+
+                    value = parsedFloat;
+                    break;
+                case ColumnType.FLOAT64:
+                    if (!double.TryParse(fieldString, floatStyle, invariant, out double parsedDouble))
+                    {
+                        throw new Exception($"Field {j} on line {line} is not valid double-precision float");
+                    }
+
+                    value = parsedDouble;
+                    break;
+                case ColumnType.UINT32:
+                    if (!UInt32.TryParse(fieldString, integerStyle, invariant, out UInt32 parsedUInt32))
+                    {
+                        throw new Exception($"Field {j} on line {line} is not valid UInt32");
+                    }
+
+                    value = parsedUInt32;
+                    break;
+                case ColumnType.UINT64:
+                    if (!UInt64.TryParse(fieldString, integerStyle, invariant, out UInt64 parsedUInt64))
+                    {
+                        throw new Exception($"Field {j} on line {line} is not valid UInt64");
+                    }
+
+                    value = parsedUInt64;
+                    break;
+                case ColumnType.INT32:
+                    if (!Int32.TryParse(fieldString, integerStyle, invariant, out Int32 parsedInt32))
+                    {
+                        throw new Exception($"Field {j} on line {line} is not valid Int32");
+                    }
+
+                    value = parsedInt32;
+                    break;
+                case ColumnType.INT64:
+                    if (!Int64.TryParse(fieldString, integerStyle, invariant, out Int64 parsedInt64))
+                    {
+                        throw new Exception($"Field {j} on line {line} is not valid Int64");
+                    }
+
+                    value = parsedInt64;
+                    break;
+                default:
+                    // BINARY was handled before decoding, so anything landing here is unexpected.
+                    throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
+            }
+
+            parsedFields[j] = value;
+            parsed.Columns[parsed.ColumnNames[j]].Add(value);
+        }
+
+        parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
+        fields.Clear();
+    }
+
     public SaneTsvRecord this[int i] => Records[i];
 
     public class SaneTsvRecord
     {
         public SaneTsv Parent { get; }
-        public string[] Fields { get; }
+        public object[] Fields { get; }
 
-        public string this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
+        public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
 
-        public SaneTsvRecord(SaneTsv parent, string[] fields)
+        public SaneTsvRecord(SaneTsv parent, object[] fields)
         {
             Parent = parent;
             Fields = fields;
diff --git a/SaneTsvTest/Program.cs b/SaneTsvTest/Program.cs
index 5126e29..ee50363 100644
--- a/SaneTsvTest/Program.cs
+++ b/SaneTsvTest/Program.cs
@@ -1,8 +1,38 @@
 using NathanMcRae;
 using System.Text;
 
-string testString1 = "column1\tcolumn2\tcolumnthree\\nyep\nvalue1\tvalue\\\\twoo\tvaluetrhee\nthis\\nis\\na\\nvalue\tnother\tno\\ther";
+{
+    string testName = "Bool test";
+    string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
+        "\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
+        "\nFALSE\tnother\tno\\ther";
 
-SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
-// See https://aka.ms/new-console-template for more information
-Console.WriteLine("Hello, World!");
+    SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
+    if (parsed.Records[0]["column1:type"] is bool result && result)
+    {
+        Console.WriteLine($"Passed {testName}");
+    }
+    else
+    {
+        Console.WriteLine($"Failed {testName}");
+    }
+}
+
+{
+    string testName = "Bad bool test";
+    try
+    {
+        string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
+            "\nTUE\tvalue\\\\t\0woo\tvaluetrhee" +
+            "\nFALSE\tnother\tno\\ther";
+
+        SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
+        Console.WriteLine($"Failed {testName}");
+    }
+    catch (Exception)
+    {
+        Console.WriteLine($"Passed {testName}");
+    }
+
+    Console.WriteLine("Done with tests");
+}