/// <summary>
/// Parses an untyped TSV buffer into its column names and string records.
/// </summary>
/// <remarks>
/// The first line is the header. Fields are separated by '\t' and lines by '\n'.
/// Inside a field the two-byte escapes <c>\n</c>, <c>\t</c>, <c>\#</c> and <c>\\</c>
/// decode to a newline, a tab, '#', and a backslash respectively; any other byte
/// after a backslash, or an unescaped '#', is an error.
/// Whatever follows the last '\n' is always parsed as a record, so a trailing
/// '\n' after the final record yields an extra record made of one empty field
/// (which fails the field-count check unless the header has exactly one column).
/// </remarks>
/// <param name="inputBuffer">Raw bytes of the TSV document, expected to be UTF-8.</param>
/// <returns>
/// The header column names, and one string array per record in input order.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
/// <exception cref="Exception">
/// The input has no header line, contains an invalid escape or an unescaped '#',
/// has a record whose field count differs from the header, has a column name
/// containing ':', or contains bytes that are not valid UTF-8.
/// </exception>
public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
{
    if (inputBuffer == null)
    {
        throw new ArgumentNullException(nameof(inputBuffer));
    }

    // Encoding.UTF8 replaces malformed sequences with U+FFFD instead of throwing,
    // which would make the "not valid UTF-8" errors below unreachable. A decoder
    // created with throwOnInvalidBytes = true reports them.
    var strictUtf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    string[] columnNames = null;

    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var records = new List<string[]>();

    // Stays negative until the header line has been read.
    int numFields = -1;
    int line = 1;
    int currentLineStart = 0;

    for (int i = 0; i < inputBuffer.Length; i++)
    {
        byte current = inputBuffer[i];

        if (current == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            switch ((char)inputBuffer[i + 1])
            {
                case 'n':
                    fieldBytes.Add((byte)'\n');
                    break;
                case '\\':
                    fieldBytes.Add((byte)'\\');
                    break;
                case 't':
                    fieldBytes.Add((byte)'\t');
                    break;
                case '#':
                    fieldBytes.Add((byte)'#');
                    break;
                default:
                    throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
            }

            // Skip the escaped byte that was just consumed.
            i++;
        }
        else if (current == '\t')
        {
            // End of field.
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (current == '\n')
        {
            // End of line: close the pending field, then the whole line.
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();

            if (numFields < 0)
            {
                columnNames = DecodeSimpleTsvHeader(fields, strictUtf8);
                numFields = fields.Count;
            }
            else
            {
                records.Add(DecodeSimpleTsvRecord(fields, numFields, line, strictUtf8));
            }

            fields.Clear();
            line++;
            currentLineStart = i + 1;
        }
        else if (current == '#')
        {
            throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
        }
        else
        {
            fieldBytes.Add(current);
        }
    }

    // The last line has no terminating '\n', so its final field is still pending.
    fields.Add(fieldBytes.ToArray());

    if (numFields < 0)
    {
        // No '\n' was ever seen, so no header line was completed.
        throw new Exception("Expected a header line terminated by '\\n', but reached end of input");
    }

    records.Add(DecodeSimpleTsvRecord(fields, numFields, line, strictUtf8));

    return (columnNames, records.ToArray());
}

// Decodes the raw header fields into column names, rejecting invalid UTF-8 and names containing ':'.
private static string[] DecodeSimpleTsvHeader(List<byte[]> headerFields, Encoding strictUtf8)
{
    var names = new string[headerFields.Count];
    for (int j = 0; j < headerFields.Count; j++)
    {
        string name;
        try
        {
            name = strictUtf8.GetString(headerFields[j]);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Column name {j} is not valid UTF-8", e);
        }

        if (name.IndexOf(':') >= 0)
        {
            throw new Exception($"Header {j} contain ':', which is not allowed for column names");
        }

        names[j] = name;
    }

    return names;
}

// Checks the field count of one record against the header and decodes each field as UTF-8.
private static string[] DecodeSimpleTsvRecord(List<byte[]> recordFields, int expectedFieldCount, int line, Encoding strictUtf8)
{
    if (expectedFieldCount != recordFields.Count)
    {
        throw new Exception($"Expected {expectedFieldCount} fields on line {line}, but found {recordFields.Count}");
    }

    var values = new string[recordFields.Count];
    for (int j = 0; j < recordFields.Count; j++)
    {
        try
        {
            values[j] = strictUtf8.GetString(recordFields[j]);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Line {line}, column {j} is not valid UTF-8", e);
        }
    }

    return values;
}