using System.Globalization;
using System.Text;

namespace NathanMcRae;

/// <summary>
/// Sane Tab-Separated Values
/// </summary>
/// <remarks>
/// Input is a byte buffer in which '\t' separates fields and '\n' separates lines. The first
/// line is the header. Inside a field, the two-byte sequences <c>\n</c>, <c>\t</c>, <c>\\</c>
/// and <c>\#</c> stand for a newline, a tab, a backslash and a '#'. An unescaped '#' is rejected.
/// </remarks>
public class SaneTsv
{
    /// <summary>Value types a column can declare after the last ':' of its header.</summary>
    public enum ColumnType
    {
        STRING,
        BOOLEAN,
        FLOAT32,
        FLOAT64,
        UINT32,
        UINT64,
        INT32,
        INT64,
        BINARY,
    }

    /// <summary>
    /// Dialect being parsed. The parser only consults it while validating the header:
    /// <see cref="SANE_TSV"/> forbids ':' in column names, every other value requires a type.
    /// </summary>
    protected enum FormatType
    {
        SANE_TSV = 0,
        TYPED_TSV = 1,
        COMMENTED_TSV = 2,
    }

    // Default number styles of the single-argument TryParse overloads, spelled out so that
    // only the culture changes when the culture-aware overloads are used.
    private const NumberStyles FloatStyles = NumberStyles.Float | NumberStyles.AllowThousands;
    private const NumberStyles IntegerStyles = NumberStyles.Integer;

    // Encoding.UTF8 silently replaces malformed bytes with U+FFFD; this instance throws
    // DecoderFallbackException instead, which is what the "not valid UTF-8" errors rely on.
    private static readonly UTF8Encoding StrictUtf8 =
        new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    // TODO: We need to be able to update all these in tandem somehow

    /// <summary>Column names in header order.</summary>
    public string[] ColumnNames { get; protected set; } = Array.Empty<string>();

    /// <summary>Column types in header order; parallel to <see cref="ColumnNames"/>.</summary>
    public ColumnType[] ColumnTypes { get; protected set; } = Array.Empty<ColumnType>();

    /// <summary>Parsed values of every record, grouped by column name.</summary>
    public Dictionary<string, List<object>> Columns { get; protected set; } = new Dictionary<string, List<object>>();

    /// <summary>Parsed records in input order (the header is not a record).</summary>
    public List<SaneTsvRecord> Records { get; protected set; } = new List<SaneTsvRecord>();

    /// <summary>Gets the record at index <paramref name="i"/> of <see cref="Records"/>.</summary>
    public SaneTsvRecord this[int i] => Records[i];

    /// <summary>Parses untyped input: every column is a string and names may not contain ':'.</summary>
    /// <param name="inputBuffer">Raw file content.</param>
    /// <returns>The parsed table.</returns>
    /// <exception cref="Exception">The input is malformed.</exception>
    public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
    {
        return Parse(inputBuffer, FormatType.SANE_TSV);
    }

    /// <summary>Parses input whose header fields all have the form <c>name:type</c>.</summary>
    /// <param name="inputBuffer">Raw file content.</param>
    /// <returns>The parsed table.</returns>
    /// <exception cref="Exception">The input is malformed.</exception>
    public static SaneTsv ParseTypedTsv(byte[] inputBuffer)
    {
        return Parse(inputBuffer, FormatType.TYPED_TSV);
    }

    /// <summary>
    /// Parses input using <see cref="FormatType.COMMENTED_TSV"/>. Header fields must carry a
    /// type; an unescaped '#' is still rejected, exactly as for the other formats.
    /// </summary>
    /// <param name="inputBuffer">Raw file content.</param>
    /// <returns>The parsed table.</returns>
    /// <exception cref="Exception">The input is malformed.</exception>
    public static SaneTsv ParseCommentedTsv(byte[] inputBuffer)
    {
        return Parse(inputBuffer, FormatType.COMMENTED_TSV);
    }

    // TODO: Have parsing errors include line / column #

    /// <summary>
    /// Splits <paramref name="inputBuffer"/> into lines and fields, decodes escapes, reads the
    /// header from the first line and converts every following line into a record.
    /// </summary>
    /// <param name="inputBuffer">Raw file content. The last record must not be followed by '\n'.</param>
    /// <param name="format">Dialect used to validate the header.</param>
    /// <returns>The parsed table.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
    /// <exception cref="Exception">
    /// Bad escape, unescaped '#', invalid header, wrong field count, or a field that cannot be
    /// converted to its column type.
    /// </exception>
    protected static SaneTsv Parse(byte[] inputBuffer, FormatType format)
    {
        if (inputBuffer is null)
        {
            throw new ArgumentNullException(nameof(inputBuffer));
        }

        var parsed = new SaneTsv();
        var fieldBytes = new List<byte>();
        var fields = new List<byte[]>();
        int numFields = -1; // negative until the header line has been read
        int line = 1;
        int currentLineStart = 0;

        for (int i = 0; i < inputBuffer.Length; i++)
        {
            byte current = inputBuffer[i];
            if (current == '\\')
            {
                if (i + 1 == inputBuffer.Length)
                {
                    throw new Exception($"Found '\\' at end of input");
                }

                fieldBytes.Add(Unescape(inputBuffer[i + 1], i));
                i++; // the escaped byte has been consumed as well
            }
            else if (current == '\t')
            {
                // end of field
                fields.Add(fieldBytes.ToArray());
                fieldBytes.Clear();
            }
            else if (current == '\n')
            {
                // end of line: close the pending field, then dispatch the whole line
                fields.Add(fieldBytes.ToArray());
                fieldBytes.Clear();

                if (numFields < 0)
                {
                    // This is the header
                    numFields = fields.Count;
                    ParseHeader(parsed, fields, format);
                    fields.Clear();
                }
                else if (numFields != fields.Count)
                {
                    throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
                }
                else
                {
                    AddRecord(parsed, fields, line);
                }

                line++;
                currentLineStart = i + 1;
            }
            else if (current == '#')
            {
                throw new Exception($"Found unescaped '#' at column {i - currentLineStart}, line {line}");
            }
            else
            {
                fieldBytes.Add(current);
            }
        }

        // The last line has no terminating '\n', so it is still pending here.
        fields.Add(fieldBytes.ToArray());

        if (numFields < 0)
        {
            // No '\n' was seen at all, so there is no complete header line.
            throw new Exception("Expected a header line followed by at least one record, but found no '\\n' in input");
        }

        // An empty last line is only a legitimate record when the table has a single column.
        if (numFields != 1 && fields.Count == 1 && fields[0].Length == 0)
        {
            throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record");
        }

        if (numFields != fields.Count)
        {
            throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
        }

        AddRecord(parsed, fields, line);
        return parsed;
    }

    /// <summary>
    /// Converts the raw fields of one line into typed values, appends them to the per-column
    /// lists of <paramref name="parsed"/>, adds a record, and clears <paramref name="fields"/>.
    /// </summary>
    /// <param name="parsed">Table being built; its header must already be populated.</param>
    /// <param name="fields">Raw bytes of each field, in column order. Emptied on return.</param>
    /// <param name="line">1-based line number, used in error messages only.</param>
    /// <exception cref="Exception">A field is not valid UTF-8 or not valid for its column type.</exception>
    protected static void AddRecord(SaneTsv parsed, List<byte[]> fields, int line)
    {
        var parsedFields = new object[fields.Count];
        for (int j = 0; j < fields.Count; j++)
        {
            object value = ParseField(parsed.ColumnTypes[j], fields[j], j, line);
            parsedFields[j] = value;
            parsed.Columns[parsed.ColumnNames[j]].Add(value);
        }

        parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
        fields.Clear();
    }

    // Maps the byte following a backslash to the byte it stands for.
    private static byte Unescape(byte escaped, int backslashIndex)
    {
        return (char)escaped switch
        {
            'n' => (byte)'\n',
            '\\' => (byte)'\\',
            't' => (byte)'\t',
            '#' => (byte)'#',
            _ => throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at {backslashIndex}"),
        };
    }

    // Reads column names and types from the header fields and creates one empty value list per column.
    private static void ParseHeader(SaneTsv parsed, List<byte[]> fields, FormatType format)
    {
        parsed.ColumnNames = new string[fields.Count];
        parsed.ColumnTypes = new ColumnType[fields.Count];

        for (int j = 0; j < fields.Count; j++)
        {
            string columnString;
            try
            {
                columnString = StrictUtf8.GetString(fields[j]);
            }
            catch (DecoderFallbackException e)
            {
                throw new Exception($"Header {j} is not valid UTF-8", e);
            }

            string columnName;
            string columnTypeString;
            // The type is whatever follows the LAST ':', so the name itself may contain ':'.
            int separator = columnString.LastIndexOf(':');
            if (separator >= 0)
            {
                if (format == FormatType.SANE_TSV)
                {
                    throw new Exception($"Header {j} contain ':', which is not allowed for column names");
                }

                columnName = columnString.Substring(0, separator);
                columnTypeString = columnString.Substring(separator + 1);
            }
            else
            {
                if (format > FormatType.SANE_TSV)
                {
                    throw new Exception($"Header {j} has no type");
                }

                columnName = columnString;
                columnTypeString = "";
            }

            // An empty type (untyped format, or a header ending in ':') means string.
            ColumnType type = columnTypeString switch
            {
                "" or "string" => ColumnType.STRING,
                "boolean" => ColumnType.BOOLEAN,
                "float32" => ColumnType.FLOAT32,
                "float64" => ColumnType.FLOAT64,
                "uint32" => ColumnType.UINT32,
                "uint64" => ColumnType.UINT64,
                "int32" => ColumnType.INT32,
                "int64" => ColumnType.INT64,
                "binary" => ColumnType.BINARY,
                _ => throw new Exception($"Invalid type '{columnTypeString}' for column {j}"),
            };

            try
            {
                parsed.Columns.Add(columnName, new List<object>());
            }
            catch (ArgumentException e)
            {
                // Dictionary.Add rejects a key that is already present.
                throw new Exception($"Column name {columnName} is not unique", e);
            }

            parsed.ColumnNames[j] = columnName;
            parsed.ColumnTypes[j] = type;
        }
    }

    // Converts the raw bytes of one field into the CLR value for its column type.
    private static object ParseField(ColumnType type, byte[] raw, int j, int line)
    {
        // All other types require the content to be UTF-8. Binary fields can ignore that.
        if (type == ColumnType.BINARY)
        {
            return raw;
        }

        string fieldString;
        try
        {
            fieldString = StrictUtf8.GetString(raw);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
        }

        // Numbers are parsed with the invariant culture so a file means the same on every machine.
        switch (type)
        {
            case ColumnType.STRING:
                return fieldString;

            case ColumnType.BOOLEAN:
                if (fieldString == "TRUE")
                {
                    return true;
                }

                if (fieldString == "FALSE")
                {
                    return false;
                }

                throw new Exception($"Field {j} on line {line} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");

            case ColumnType.FLOAT32:
                if (!float.TryParse(fieldString, FloatStyles, CultureInfo.InvariantCulture, out float parsedFloat))
                {
                    throw new Exception($"Field {j} on line {line} is not valid single-precision float");
                }

                return parsedFloat;

            case ColumnType.FLOAT64:
                if (!double.TryParse(fieldString, FloatStyles, CultureInfo.InvariantCulture, out double parsedDouble))
                {
                    throw new Exception($"Field {j} on line {line} is not valid double-precision float");
                }

                return parsedDouble;

            case ColumnType.UINT32:
                if (!UInt32.TryParse(fieldString, IntegerStyles, CultureInfo.InvariantCulture, out UInt32 parsedUInt32))
                {
                    throw new Exception($"Field {j} on line {line} is not valid UInt32");
                }

                return parsedUInt32;

            case ColumnType.UINT64:
                if (!UInt64.TryParse(fieldString, IntegerStyles, CultureInfo.InvariantCulture, out UInt64 parsedUInt64))
                {
                    throw new Exception($"Field {j} on line {line} is not valid UInt64");
                }

                return parsedUInt64;

            case ColumnType.INT32:
                if (!Int32.TryParse(fieldString, IntegerStyles, CultureInfo.InvariantCulture, out Int32 parsedInt32))
                {
                    throw new Exception($"Field {j} on line {line} is not valid Int32");
                }

                return parsedInt32;

            case ColumnType.INT64:
                if (!Int64.TryParse(fieldString, IntegerStyles, CultureInfo.InvariantCulture, out Int64 parsedInt64))
                {
                    throw new Exception($"Field {j} on line {line} is not valid Int64");
                }

                return parsedInt64;

            default:
                throw new Exception($"Unexpected type {type}");
        }
    }

    /// <summary>One parsed line: the typed field values plus a link back to the owning table.</summary>
    public class SaneTsvRecord
    {
        /// <summary>Table this record belongs to; supplies the column names for lookups.</summary>
        public SaneTsv Parent { get; }

        /// <summary>Typed values in column order.</summary>
        public object[] Fields { get; }

        /// <summary>Gets the value stored under <paramref name="columnName"/>.</summary>
        /// <exception cref="IndexOutOfRangeException">
        /// <paramref name="columnName"/> is not one of the parent's column names
        /// (Array.IndexOf yields -1, which is then used as the index).
        /// </exception>
        public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];

        /// <summary>Creates a record for <paramref name="parent"/> holding <paramref name="fields"/>.</summary>
        public SaneTsvRecord(SaneTsv parent, object[] fields)
        {
            Parent = parent;
            Fields = fields;
        }
    }
}