Add freeform ParseSimpleTsv

No column spec, just parse whatever you get
2024-02-22 23:16:35 -08:00
parent 695ad1f110
commit 0b213c98c2
1 changed files with 158 additions and 7 deletions
@@ -8,7 +8,7 @@ public class Tsv<T> where T : SaneTsv.TsvRecord
  public virtual List<T> Records { get; set; }
 }

-public class CommentedTsv<T>: Tsv<T> where T : SaneTsv.TsvRecord
+public class CommentedTsv<T> : Tsv<T> where T : SaneTsv.TsvRecord
 {
  public override List<T> Records { get; set; }
  public string FileComment { get; set; } = null;
@@ -355,7 +355,7 @@ public class SaneTsv
    return parsed;
  }

-  protected static T ParseCurrentCommentedRecord<T> (Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
+  protected static T ParseCurrentCommentedRecord<T>(Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
  {
    // Thin generic wrapper for record types constrained to CommentedTsvRecord:
    // forwards every argument unchanged to ParseCurrentRecord<T> and casts the
    // result to T.
    // NOTE(review): ParseCurrentRecord is defined outside this view; presumably
    // it performs the actual field conversion and comment assignment - confirm.
    return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line);
  }
@@ -644,6 +644,157 @@ public class SaneTsv
    return Encoding.UTF8.GetBytes(escapedString.ToString());
  }

+  /// <summary>
+  /// Parses a TSV buffer without any column type specification: the first line
+  /// is the header and every following line is a record of raw strings.
+  /// </summary>
+  /// <remarks>
+  /// Fields are separated by tab and lines by '\n'. Inside a field the escapes
+  /// <c>\n</c>, <c>\t</c>, <c>\\</c> and <c>\#</c> are decoded; an unescaped '#'
+  /// is rejected. Every line must contain the same number of fields as the
+  /// header, and header names may not contain ':'. Input without any line break
+  /// is treated as a header with zero records.
+  /// </remarks>
+  /// <param name="inputBuffer">UTF-8 encoded TSV content.</param>
+  /// <returns>The column names and one string array per record.</returns>
+  /// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
+  /// <exception cref="Exception">
+  /// The input has an invalid escape, an unescaped '#', a line whose field count
+  /// differs from the header, a ':' in a column name, or bytes that are not
+  /// valid UTF-8.
+  /// </exception>
+  public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
+  {
+    if (inputBuffer == null)
+    {
+      throw new ArgumentNullException(nameof(inputBuffer));
+    }
+
+    // Encoding.UTF8 silently substitutes U+FFFD for malformed bytes, so a strict
+    // decoder is needed for the "not valid UTF-8" errors to ever be raised.
+    var strictUtf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
+
+    string[] columnNames = null;
+
+    var fieldBytes = new List<byte>();
+    var fields = new List<byte[]>();
+    var records = new List<string[]>();
+
+    int line = 1;
+    int currentLineStart = 0;
+    for (int i = 0; i < inputBuffer.Length; i++)
+    {
+      byte current = inputBuffer[i];
+
+      if (current == '\\')
+      {
+        if (i + 1 == inputBuffer.Length)
+        {
+          throw new Exception($"Found '\\' at end of input");
+        }
+
+        switch (inputBuffer[i + 1])
+        {
+          case (byte)'n':
+            fieldBytes.Add((byte)'\n');
+            break;
+          case (byte)'\\':
+            fieldBytes.Add((byte)'\\');
+            break;
+          case (byte)'t':
+            fieldBytes.Add((byte)'\t');
+            break;
+          case (byte)'#':
+            fieldBytes.Add((byte)'#');
+            break;
+          default:
+            throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
+        }
+
+        // Skip the second byte of the escape sequence
+        i++;
+      }
+      else if (current == '\t')
+      {
+        // end of field
+        fields.Add(fieldBytes.ToArray());
+        fieldBytes.Clear();
+      }
+      else if (current == '\n')
+      {
+        // end of line: the pending bytes are the last field of this line
+        fields.Add(fieldBytes.ToArray());
+        fieldBytes.Clear();
+
+        if (columnNames == null)
+        {
+          // This is the header
+          columnNames = DecodeSimpleTsvHeader(fields, strictUtf8);
+        }
+        else
+        {
+          records.Add(DecodeSimpleTsvRecord(fields, columnNames.Length, line, strictUtf8));
+        }
+
+        fields.Clear();
+        line++;
+        currentLineStart = i + 1;
+      }
+      else if (current == '#')
+      {
+        throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
+      }
+      else
+      {
+        fieldBytes.Add(current);
+      }
+    }
+
+    // The final line is not terminated by '\n', so flush it here
+    fields.Add(fieldBytes.ToArray());
+
+    if (columnNames == null)
+    {
+      // No line break in the input at all: it consists of the header only
+      columnNames = DecodeSimpleTsvHeader(fields, strictUtf8);
+    }
+    else
+    {
+      // A completely empty last line in a multi-column file means the input ended
+      // with a stray '\n'. (With a single column an empty line is a valid record.)
+      if (fields.Count == 1 && fields[0].Length == 0 && columnNames.Length != 1)
+      {
+        throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record");
+      }
+
+      records.Add(DecodeSimpleTsvRecord(fields, columnNames.Length, line, strictUtf8));
+    }
+
+    return (columnNames, records.ToArray());
+  }
+
+  // Decodes the header fields into column names, rejecting invalid UTF-8 and names containing ':'.
+  private static string[] DecodeSimpleTsvHeader(List<byte[]> fields, Encoding strictUtf8)
+  {
+    var columnNames = new string[fields.Count];
+    for (int j = 0; j < fields.Count; j++)
+    {
+      string columnString;
+      try
+      {
+        columnString = strictUtf8.GetString(fields[j]);
+      }
+      catch (DecoderFallbackException e)
+      {
+        throw new Exception($"Column name {j} is not valid UTF-8", e);
+      }
+
+      if (columnString.Contains(':'))
+      {
+        throw new Exception($"Header {j} contain ':', which is not allowed for column names");
+      }
+
+      columnNames[j] = columnString;
+    }
+    return columnNames;
+  }
+
+  // Decodes one record line into strings after checking it has the expected number of fields.
+  private static string[] DecodeSimpleTsvRecord(List<byte[]> fields, int expectedFieldCount, int line, Encoding strictUtf8)
+  {
+    if (expectedFieldCount != fields.Count)
+    {
+      throw new Exception($"Expected {expectedFieldCount} fields on line {line}, but found {fields.Count}");
+    }
+
+    var fieldStrings = new string[fields.Count];
+    for (int j = 0; j < fields.Count; j++)
+    {
+      try
+      {
+        fieldStrings[j] = strictUtf8.GetString(fields[j]);
+      }
+      catch (DecoderFallbackException e)
+      {
+        throw new Exception($"Line {line}, column {j} is not valid UTF-8", e);
+      }
+    }
+    return fieldStrings;
+  }
+
  public static Type GetColumnFromType(Type type)
  {
    if (type == typeof(string))