Minor format spec rewording

Add roadmap
Add ExtraTSV
2024-02-15 20:27:35 -08:00 · 2024-02-15 20:26:56 -08:00 · 2024-02-15 20:26:40 -08:00 · 2024-02-15 20:24:44 -08:00 · 2024-02-15 20:24:31 -08:00 · 2024-02-15 20:24:01 -08:00
11 changed files with 507 additions and 131 deletions
--- a/SaneTsv/ExtraTsv/ExtraTsv.cs
+++ b/SaneTsv/ExtraTsv/ExtraTsv.cs
@ -0,0 +1,125 @@
 using System.Globalization;
 using System.Text.RegularExpressions;
 namespace NathanMcRae;
 /// <summary>
 /// Parser for ExtraTSV: a Commented TSV whose file comment starts with
 /// ' ExtraTSV Vx.y.z' and whose column names may carry an extra type suffix
 /// (':iso8601' on string columns, ':&lt;unit&gt;:units' on float columns).
 /// </summary>
 public class ExtraTsv : SaneTsv
 {
  /// <summary>Column type marker for columns holding ISO 8601 timestamps.</summary>
  public class Iso8601Type : ColumnType { }

  /// <summary>Column type marker for columns whose values carry a physical unit.</summary>
  public class PhysicalUnitsType : ColumnType
  {
    /// <summary>The unit symbol this column type was created with.</summary>
    public string Units { get; }

    public PhysicalUnitsType(string Units)
    {
      // The parameter shadows the property, so the assignment must be qualified.
      this.Units = Units;
    }
  }

  /// <summary>Unit symbols accepted in a ':&lt;unit&gt;:units' column suffix.</summary>
  public static readonly string[] ValidUnits =
  {
    "m",
    "s",
    "A",
    "K",
    "cd",
    "mol",
    "kg",
    "Hz",
    "rad",
    "sr",
    "N",
    "Pa",
    "J",
    "W",
    "C",
    "V",
    "F",
    "Ω",
    "S",
    "Wb",
    "T",
    "H",
    "°C",
    "lm",
    "lx",
    "Bq",
    "Gy",
    "Sv",
    "kat"
  };

  public static readonly int MajorVersion = 0;
  public static readonly int MinorVersion = 0;
  public static readonly int PatchVersion = 1;

  // Matched against the file comment, which excludes the leading '#'.
  public static Regex VersionRegex = new Regex(@"^ ExtraTSV V(\d+)\.(\d+)\.(\d+)");

  /// <summary>
  /// Parses <paramref name="inputBuffer"/> as Commented TSV, checks the ExtraTSV
  /// version line in the file comment, retypes ':iso8601' string columns and
  /// converts their fields to <see cref="DateTime"/>.
  /// </summary>
  /// <param name="inputBuffer">The raw bytes of the ExtraTSV file.</param>
  /// <returns>A new <see cref="ExtraTsv"/> holding the parsed columns and records.</returns>
  /// <exception cref="Exception">
  /// The version line is missing or has a different major version, a units suffix
  /// names an unknown unit, a record has a comment, or a timestamp is malformed.
  /// </exception>
  public static ExtraTsv ParseExtraTsv(byte[] inputBuffer)
  {
    SaneTsv tsv = ParseCommentedTsv(inputBuffer);

    string versionLineError = $"ExtraTSV expects the file to start with '# ExtraTSV Vx.y.z' where x.y.z is a version compatible with {MajorVersion}.{MinorVersion}.{PatchVersion}";
    if (tsv.FileComment == null)
    {
      throw new Exception(versionLineError);
    }

    Match match = VersionRegex.Match(tsv.FileComment);
    if (!match.Success)
    {
      throw new Exception(versionLineError);
    }

    int fileMajorVersion = int.Parse(match.Groups[1].Value, CultureInfo.InvariantCulture);
    if (fileMajorVersion != MajorVersion)
    {
      // The check is for inequality, so the file may be older as well as newer.
      throw new Exception($"File has major version ({fileMajorVersion}) which does not match this parser's major version {MajorVersion}");
    }

    for (int i = 0; i < tsv.ColumnNames.Length; i++)
    {
      string[] typeParts = tsv.ColumnNames[i].Split(":");

      // A name with no ':' has no suffix to strip. Without this guard a column
      // named exactly "iso8601" or "units" would index before the start of the
      // name / typeParts and crash with an out-of-range exception.
      if (typeParts.Length < 2)
      {
        continue;
      }

      string extraType = typeParts[typeParts.Length - 1];
      if (extraType == "iso8601" && tsv.ColumnTypes[i] == typeof(StringType))
      {
        string columnName = tsv.ColumnNames[i].Substring(0, tsv.ColumnNames[i].Length - ":iso8601".Length);
        tsv.ColumnNames[i] = columnName;
        tsv.ColumnTypes[i] = typeof(Iso8601Type);
      }
      // TODO: ISO8601 time spans
      // TODO: ISO8601 time durations
      else if (extraType == "units" && (tsv.ColumnTypes[i] == typeof(Float64Type) || tsv.ColumnTypes[i] == typeof(Float32Type)))
      {
        string units = typeParts[typeParts.Length - 2];
        if (!ValidUnits.Contains(units))
        {
          throw new Exception($"Invalid units type '{units}' for column {i}");
        }
        // TODO: How to store type information since the ColumnTypes is of type Type?
      }
    }

    CultureInfo provider = CultureInfo.InvariantCulture;
    for (int i = 0; i < tsv.Records.Count; i++)
    {
      if (tsv.Records[i].Comment != null)
      {
        throw new Exception($"Line {tsv.Records[i].Line} has comment above it which is not allowed");
      }

      for (int j = 0; j < tsv.ColumnNames.Length; j++)
      {
        if (tsv.ColumnTypes[j] == typeof(Iso8601Type))
        {
          if (!DateTime.TryParseExact((string)tsv.Records[i][j], "yyyy-MM-ddTHH:mm:ss.ffff", provider, DateTimeStyles.None, out DateTime parsed))
          {
            throw new Exception($"ISO 8601 timestamp format error on line {tsv.Records[i].Line}, field {j}");
          }
          tsv.Records[i].Fields[j] = parsed;
        }
      }
    }

    // ParseCommentedTsv hands back a plain SaneTsv instance, so a downcast to
    // ExtraTsv would throw InvalidCastException. Build a real ExtraTsv that
    // shares the parsed arrays and record list instead.
    // NOTE(review): each record's Parent still refers to the intermediate SaneTsv;
    // name lookups keep working because the ColumnNames array is shared.
    return new ExtraTsv
    {
      ColumnNames = tsv.ColumnNames,
      ColumnTypes = tsv.ColumnTypes,
      Records = tsv.Records,
      FileComment = tsv.FileComment,
    };
  }
 }
--- a/SaneTsv/ExtraTsv/ExtraTsv.csproj
+++ b/SaneTsv/ExtraTsv/ExtraTsv.csproj
@ -0,0 +1,14 @@
 <Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net6.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <RootNamespace>NathanMcRae</RootNamespace>
  </PropertyGroup>
  <ItemGroup>
    <ProjectReference Include="..\SaneTsv.csproj" />
  </ItemGroup>
 </Project>
--- a/SaneTsv/ExtraTsv/readme.md
+++ b/SaneTsv/ExtraTsv/readme.md
@ -0,0 +1,44 @@
 Extra TSV adds many convenience types to Sane TSV:
 - Timestamps
  Just this format for now: yyyy-MM-ddTHH:mm:ss.ffff
 - Timespans
 - Time durations
 - Multiformats
  - Multihashes
  - Multiprotocols
  - ...
 - Physical units
  To start with, just use SI base and derived units
  - Base units
    - m
    - s
    - A
    - K
    - cd
    - mol
    - kg
  - Derived units
    - Hz
    - rad
    - sr
    - N
    - Pa
    - J
    - W
    - C
    - V
    - F
    - Ω
    - S
    - Wb
    - T
    - H
    - °C
    - lm
    - lx
    - Bq
    - Gy
    - Sv
    - kat
  How to handle derived units?
--- a/SaneTsv/ExtraTsvTest/ExtraTsvTest.csproj
+++ b/SaneTsv/ExtraTsvTest/ExtraTsvTest.csproj
@ -0,0 +1,14 @@
 <Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>
  <ItemGroup>
    <ProjectReference Include="..\ExtraTsv\ExtraTsv.csproj" />
  </ItemGroup>
 </Project>
--- a/SaneTsv/ExtraTsvTest/Program.cs
+++ b/SaneTsv/ExtraTsvTest/Program.cs
@ -0,0 +1,20 @@
 using NathanMcRae;
 using System.Text;
 {
  // Parses a small ExtraTSV document and checks that the first record's
  // boolean column comes back as a true bool.
  string label = "Bool test";
  string tsvText = "# ExtraTSV V0.0.1\n" +
    "column1:ty\\#pe:boolean\tcolumn2:binary\tcolumnthree\\nyep:iso8601:string" +
    "\nTRUE\tvalue\\\\t\0woo\t2024-02-15T18:03:30.0000" +
    "\nFALSE\tnother\t2024-02-15T18:03:39.0001";

  byte[] tsvBytes = Encoding.UTF8.GetBytes(tsvText);
  ExtraTsv table = ExtraTsv.ParseExtraTsv(tsvBytes);

  object firstValue = table.Records[0]["column1:ty#pe"];
  bool passed = firstValue is bool flag && flag;
  Console.WriteLine(passed ? $"Passed {label}" : $"Failed {label}");
 }
--- a/SaneTsv/SaneTsv.cs
+++ b/SaneTsv/SaneTsv.cs
@ -7,36 +7,34 @@ namespace NathanMcRae;
 /// </summary>
 public class SaneTsv
 {
-  public enum ColumnType
+  // Like an enum, but more extensible
-  {
+  public class ColumnType { }
-    STRING,
+  public class StringType : ColumnType { }
-    BOOLEAN,
+  public class BooleanType : ColumnType { }
-    FLOAT32,
+  public class Float32Type : ColumnType { }
-    FLOAT64,
+  public class Float64Type : ColumnType { }
-    UINT32,
+  public class UInt32Type : ColumnType { }
-    UINT64,
+  public class UInt64Type : ColumnType { }
-    INT32,
+  public class Int32Type : ColumnType { }
-    INT64,
+  public class Int64Type : ColumnType { }
-    BINARY,
+  public class BinaryType : ColumnType { }
  }
  protected enum FormatType
  {
-    SANE_TSV = 0,
+    SIMPLE_TSV = 0,
    TYPED_TSV = 1,
    COMMENTED_TSV = 2,
  }
  // TODO: We need to be able to update all these in tandem somehow
  public string[] ColumnNames { get; protected set; }
-  public ColumnType[] ColumnTypes { get; protected set; }
+  public Type[] ColumnTypes { get; protected set; }
  public Dictionary<string, List<object>> Columns { get; protected set; }
  public List<SaneTsvRecord> Records { get; protected set; }
  public string FileComment { get; protected set; } = null;
-  public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
+  public static SaneTsv ParseSimpleTsv(byte[] inputBuffer)
  {
-    return Parse(inputBuffer, FormatType.SANE_TSV);
+    return Parse(inputBuffer, FormatType.SIMPLE_TSV);
  }
  public static SaneTsv ParseTypedTsv(byte[] inputBuffer)
@ -53,9 +51,8 @@ public class SaneTsv
  protected static SaneTsv Parse(byte[] inputBuffer, FormatType format)
  {
    var parsed = new SaneTsv();
    parsed.Columns = new Dictionary<string, List<object>>();
    parsed.ColumnNames = new string[] { };
-    parsed.ColumnTypes = new ColumnType[] { };
+    parsed.ColumnTypes = new Type[] { };
    parsed.Records = new List<SaneTsvRecord>();
    var fieldBytes = new List<byte>();
@ -95,7 +92,7 @@ public class SaneTsv
        }
        else
        {
-          throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}");
+          throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
        }
      }
      else if (inputBuffer[i] == '\t')
@ -116,7 +113,7 @@ public class SaneTsv
          numFields = fields.Count;
          parsed.ColumnNames = new string[numFields];
-          parsed.ColumnTypes = new ColumnType[numFields];
+          parsed.ColumnTypes = new Type[numFields];
          int numTypesBlank = 0;
@ -135,7 +132,7 @@ public class SaneTsv
            string columnTypeString;
            string columnName;
            if (columnString.Contains(':')) {
-              if (format == FormatType.SANE_TSV)
+              if (format == FormatType.SIMPLE_TSV)
              {
                throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names");
              }
@ -144,7 +141,7 @@ public class SaneTsv
            }
            else
            {
-              if (format > FormatType.SANE_TSV)
+              if (format > FormatType.SIMPLE_TSV)
              {
                throw new Exception($"Header {fields.Count} has no type");
              }
@ -152,53 +149,46 @@ public class SaneTsv
              columnName = columnString;
            }
-            ColumnType type;
+            Type type;
            switch (columnTypeString)
            {
              case "":
                numTypesBlank++;
-                type = ColumnType.STRING;
+                type = typeof(StringType);
                break;
              case "string":
-                type = ColumnType.STRING;
+                type = typeof(StringType);
                break;
              case "boolean":
-                type = ColumnType.BOOLEAN;
+                type = typeof(BooleanType);
                break;
              case "float32":
-                type = ColumnType.FLOAT32;
+                type = typeof(Float32Type);
                break;
              case "float64":
-                type = ColumnType.FLOAT64;
+                type = typeof(Float64Type);
                break;
              case "uint32":
-                type = ColumnType.UINT32;
+                type = typeof(UInt32Type);
                break;
              case "uint64":
-                type = ColumnType.UINT64;
+                type = typeof(UInt64Type);
                break;
              case "int32":
-                type = ColumnType.INT32;
+                type = typeof(Int32Type);
                break;
              case "int64":
-                type = ColumnType.INT64;
+                type = typeof(Int64Type);
                break;
              case "binary":
-                type = ColumnType.BINARY;
+                type = typeof(BinaryType);
                break;
              default:
                throw new Exception($"Invalid type '{columnTypeString}' for column {j}");
            }
-            try
+            // TODO: Check column name uniqueness
            {
              parsed.Columns.Add(columnName, new List<object>());
            }
            catch (Exception e)
            {
              throw new Exception($"Column name {columnName} is not unique", e);
            }
            parsed.ColumnNames[j] = columnName;
            parsed.ColumnTypes[j] = type;
@ -224,7 +214,7 @@ public class SaneTsv
            comment = currentComment.ToString();
            currentComment.Clear();
          }
-          parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
+          parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line));
          fields.Clear();
        }
@ -241,8 +231,11 @@ public class SaneTsv
          {
            var commentBytes = new byte[j - i - 1];
            Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
            if (currentComment.Length > 0)
            {
              currentComment.Append('\n');
            }
            currentComment.Append(Encoding.UTF8.GetString(commentBytes));
            currentComment.Append("\n");
            i = j;
            currentLineStart = i + 1;
            line++;
@ -281,23 +274,25 @@ public class SaneTsv
        comment = currentComment.ToString();
        currentComment.Clear();
      }
-      parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
+      parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line));
      fields.Clear();
    }
    return parsed;
  }
  /// <summary>
  /// Note: this modifies 'parsed'
  /// </summary>
  protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
  {
    var parsedFields = new object[fields.Count];
    for (int j = 0; j < fields.Count; j++)
    {
      // All other types require the content to be UTF-8. Binary fields can ignore that.
-      if (parsed.ColumnTypes[j] == ColumnType.BINARY)
+      if (parsed.ColumnTypes[j] == typeof(BinaryType))
      {
        parsedFields[j] = fields[j];
        parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
        continue;
      }
@ -311,13 +306,14 @@ public class SaneTsv
        throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
      }
-      switch (parsed.ColumnTypes[j])
+      // TODO: Add checking for numeric types format
      if (parsed.ColumnTypes[j] == typeof(StringType))
      {
        case ColumnType.STRING:
        parsedFields[j] = fieldString;
-          parsed.Columns[parsed.ColumnNames[j]].Add(fieldString);
+      }
-          break;
+      else if (parsed.ColumnTypes[j] == typeof(BooleanType))
-        case ColumnType.BOOLEAN:
+      {
        bool parsedBool;
        if (fieldString == "TRUE")
        {
@ -333,65 +329,63 @@ public class SaneTsv
        }
        parsedFields[j] = parsedBool;
-          parsed.Columns[parsed.ColumnNames[j]].Add(parsedBool);
+      }
-          break;
+      else if (parsed.ColumnTypes[j] == typeof(Float32Type))
-        case ColumnType.FLOAT32:
+      {
        if (!float.TryParse(fieldString, out float parsedFloat))
        {
          throw new Exception($"Field {j} on line {line} is not valid single-precision float");
        }
        parsedFields[j] = parsedFloat;
-          parsed.Columns[parsed.ColumnNames[j]].Add(parsedFloat);
+      }
-          break;
+      else if (parsed.ColumnTypes[j] == typeof(Float64Type))
-        case ColumnType.FLOAT64:
+      {
        if (!double.TryParse(fieldString, out double parsedDouble))
        {
          throw new Exception($"Field {j} on line {line} is not valid double-precision float");
        }
        parsedFields[j] = parsedDouble;
-          parsed.Columns[parsed.ColumnNames[j]].Add(parsedDouble);
+      }
-          break;
+      else if (parsed.ColumnTypes[j] == typeof(UInt32Type))
-        case ColumnType.UINT32:
+      {
        if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
        {
          throw new Exception($"Field {j} on line {line} is not valid UInt32");
        }
        parsedFields[j] = parsedUInt32;
-          parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt32);
+      }
-          break;
+      else if (parsed.ColumnTypes[j] == typeof(UInt64Type))
-        case ColumnType.UINT64:
+      {
        if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64))
        {
          throw new Exception($"Field {j} on line {line} is not valid UInt64");
        }
        parsedFields[j] = parsedUInt64;
-          parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt64);
+      }
-          break;
+      else if (parsed.ColumnTypes[j] == typeof(Int32Type))
-        case ColumnType.INT32:
+      {
        if (!Int32.TryParse(fieldString, out Int32 parsedInt32))
        {
          throw new Exception($"Field {j} on line {line} is not valid Int32");
        }
        parsedFields[j] = parsedInt32;
-          parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt32);
+      }
-          break;
+      else if (parsed.ColumnTypes[j] == typeof(Int64Type))
-        case ColumnType.INT64:
+      {
        if (!Int64.TryParse(fieldString, out Int64 parsedInt64))
        {
          throw new Exception($"Field {j} on line {line} is not valid Int64");
        }
        parsedFields[j] = parsedInt64;
-          parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt64);
+      }
-          break;
+      else
-        case ColumnType.BINARY:
+      {
          throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
        default:
        throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
      }
    }
@ -399,6 +393,103 @@ public class SaneTsv
    return parsedFields;
  }
  /// <summary>
  /// Serializes a header and rows of string fields as UTF-8 encoded Simple TSV.
  /// Newline, tab, backslash and '#' characters in names and fields are escaped.
  /// </summary>
  /// <param name="header">Column names; must be unique and must not contain ':'.</param>
  /// <param name="data">Rows of fields; every row must have as many fields as the header.</param>
  /// <returns>The serialized bytes, with no trailing '\n'.</returns>
  /// <exception cref="Exception">
  /// A column name contains ':', column names are not unique, or a row's field
  /// count differs from the header's.
  /// </exception>
  public static byte[] SerializeSimpleTsv(IList<string> header, IList<IList<string>> data)
  {
    var escapedString = new StringBuilder();

    // Serialize header
    var seenNames = new HashSet<string>();
    for (int i = 0; i < header.Count; i++)
    {
      if (header[i].Contains(':'))
      {
        throw new Exception($"Column {i} contains the character ':'");
      }

      if (!seenNames.Add(header[i]))
      {
        throw new Exception("Column names in header must be unique");
      }

      AppendEscapedField(escapedString, header[i]);

      if (i < header.Count - 1)
      {
        escapedString.Append('\t');
      }
      else if (data.Count > 0)
      {
        // Only terminate the header line when a record follows: the format
        // forbids a trailing '\n' because it reads as an extra empty row.
        escapedString.Append('\n');
      }
    }

    // Serialize data
    for (int i = 0; i < data.Count; i++)
    {
      IList<string> row = data[i];
      if (row.Count != header.Count)
      {
        throw new Exception($"Row {i} has {row.Count} fields but the header has {header.Count}");
      }

      for (int j = 0; j < row.Count; j++)
      {
        AppendEscapedField(escapedString, row[j]);

        if (j < row.Count - 1)
        {
          escapedString.Append('\t');
        }
        else if (i < data.Count - 1)
        {
          escapedString.Append('\n');
        }
      }
    }

    return Encoding.UTF8.GetBytes(escapedString.ToString());
  }

  /// <summary>
  /// Appends <paramref name="field"/> to <paramref name="output"/>, replacing
  /// '\n', '\t', '\\' and '#' with their two-character escape sequences.
  /// </summary>
  private static void AppendEscapedField(StringBuilder output, string field)
  {
    foreach (char c in field)
    {
      switch (c)
      {
        case '\n':
          output.Append("\\n");
          break;
        case '\t':
          output.Append("\\t");
          break;
        case '\\':
          output.Append("\\\\");
          break;
        case '#':
          output.Append("\\#");
          break;
        default:
          output.Append(c);
          break;
      }
    }
  }
  public SaneTsvRecord this[int i] => Records[i];
  public class SaneTsvRecord
@ -406,14 +497,18 @@ public class SaneTsv
    public SaneTsv Parent { get; }
    public string Comment { get; }
    public object[] Fields { get; }
    public int Line { get; }
    public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
-    public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
+    public object this[int columnIndex] => Fields[columnIndex];
    public SaneTsvRecord(SaneTsv parent, object[] fields, string comment, int line)
    {
      Parent = parent;
      Fields = fields;
      Comment = comment;
      Line = line;
    }
  }
 }
--- a/SaneTsv/SaneTsv.csproj
+++ b/SaneTsv/SaneTsv.csproj
@ -10,8 +10,14 @@
  </PropertyGroup>
  <ItemGroup>
    <Compile Remove="ExtraTsvTest\**" />
    <Compile Remove="ExtraTsv\**" />
    <Compile Remove="SaneTsvTest\**" />
    <EmbeddedResource Remove="ExtraTsvTest\**" />
    <EmbeddedResource Remove="ExtraTsv\**" />
    <EmbeddedResource Remove="SaneTsvTest\**" />
    <None Remove="ExtraTsvTest\**" />
    <None Remove="ExtraTsv\**" />
    <None Remove="SaneTsvTest\**" />
  </ItemGroup>
--- a/SaneTsv/SaneTsv.sln
+++ b/SaneTsv/SaneTsv.sln
@ -5,7 +5,11 @@ VisualStudioVersion = 17.7.34024.191
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsv", "SaneTsv.csproj", "{DBC5CE44-361C-4387-B1E2-409C1CAE2B4C}"
 EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}"
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ExtraTsv", "ExtraTsv\ExtraTsv.csproj", "{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}"
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ExtraTsvTest", "ExtraTsvTest\ExtraTsvTest.csproj", "{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -21,6 +25,14 @@ Global
 		{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.Build.0 = Release|Any CPU
 		{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 		{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.Build.0 = Release|Any CPU
 		{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 		{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/SaneTsv/SaneTsvTest/Program.cs
+++ b/SaneTsv/SaneTsvTest/Program.cs
@ -39,7 +39,7 @@ using System.Text;
 {
  string testName = "Comment test";
  string testString1 = "#This is a file comment\n" +
-    " #One more file comment line\n" +
+    "#One more file comment line\n" +
    "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
    "\n#This is a comment" +
    "\n#Another comment line" +
@ -49,4 +49,26 @@ using System.Text;
  SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1));
 }
 {
  // Round-trips a Simple TSV document through the parser and serializer and
  // checks that the text comes back unchanged.
  string label = "Serde test";
  string original = "column1\tcolumn2\tcolumnthree\\nyep" +
    "\nTRUE\tvalue\\\\twoo\tvaluetrhee" +
    "\nFALSE\tnother\tno\\ther";

  SaneTsv table = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(original));

  // Every Simple TSV field is rendered back to text before serializing.
  string[][] rows = table.Records
    .Select(record => record.Fields.Select(field => field.ToString()).ToArray())
    .ToArray();
  byte[] encoded = SaneTsv.SerializeSimpleTsv(table.ColumnNames, rows);
  string roundTripped = Encoding.UTF8.GetString(encoded);

  bool passed = original == roundTripped;
  Console.WriteLine(passed ? $"Passed {label}" : $"Failed {label}");
 }
 Console.WriteLine("Done with tests");
 // TODO: Check qNaN, sNaN, +inf, -inf values for float types
--- a/SaneTsv/readme.md
+++ b/SaneTsv/readme.md
@ -1,6 +1,10 @@
 # Sane TSV
-Sane TSV is a strict format for tabular data.
+Sane Tab-Separated Values is a series of tabular formats as an alternative to the under-specified TSV / CSV quagmire.
 # Simple TSV
 Simple TSV is a strict format for tabular data.
 '\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line.
@ -12,7 +16,7 @@ Empty fields (i.e. two subsequent '\t' characters) are allowed.
 The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)).
-All lines in the file must have the same number of fields.
+All lines in the file must have the same number of fields as are in the header.
 The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error.
@ -20,7 +24,7 @@ Implementations of the format do not need to handle file reading and writing dir
 # Typed TSV
-Typed TSV builds on Sane TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
+Typed TSV builds on Simple TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
 - 'string'
 - 'boolean'
@ -34,7 +38,7 @@ Typed TSV builds on Sane TSV to allow for typing of columns. All column names in
 Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters.
-All fields in the rest of the file must be of the type corresponding the their column.
+All fields in the rest of the file must be of the type corresponding to their column.
 Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
@ -49,13 +53,17 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
 - 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
 - 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)
 Binary fields are left as-is (after unescaping is performed).
 Typed TSV files should have the .ytsv extension (.ttsv is already used).
 # Commented TSV
-Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. They are kept distinct so that some applications of it can take advantage of the extra flexibility, while others can stick with the more restricted Typed TSV format.
+Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. The formats are kept distinct so that some applications can take advantage of the extra flexibility comments allow, while others can stick with the more restricted Typed TSV format.
-Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Any unescaped '#' after the start of a line are errors.
+Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Note that the '#' character is excluded from the comment data.
 Multiple consecutive comment lines are considered a single comment, with each line separated by a '\n'.
 Comments must be UTF-8 encoded text.
@ -78,7 +86,7 @@ Note that extended formats must remain parseable by baseline parsers, hence we m
 Extending formats may also have restrictions. For example, they could disallow record comments and only allow the file comment above the header.
-Extended formats may still use the .ctsv extension, though they could use a dedicated one as well.
+Extended formats may still use the .ctsv extension, though they could use a dedicated one instead.
 ## Ideas for Extension
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,16 @@
 # Sane TSV
 ## Roadmap
 - Improve error reporting by including line/column information in exceptions
 - Come up with a static-typing interface
  Something that doesn't require an array of objects
 - Check numeric formatting matches spec
 - Do parallel parsing / serializing implementation
  - Next task: Refactor parsing so that it will start and end at arbitrary indices and return an array of SaneTsvRecords. The refactor should ignore the current record (unless at the start of the buffer) and continue parsing the record the end index is in.
 - More optimization and making parsing modular:
  - Have callbacks for header parsing and field parsing
  - That way other formats (like ExtraTSV) don't have to iterate through the entire set of data again.
 - Finish ExtraTSV implementation
 - Do Zig implementation
 - Make a C interface from that
Author	SHA1	Message	Date
Nathan McRae	0b302734e9	Minor format spec rewording	2024-02-15 20:27:35 -08:00
Nathan McRae	7bc553905d	Add roadmap	2024-02-15 20:26:56 -08:00
Nathan McRae	932fbd553a	Add ExtraTSV	2024-02-15 20:26:40 -08:00
Nathan McRae	93f2e2ea5b	Make comments not end with \n	2024-02-15 20:24:44 -08:00
Nathan McRae	99766f99a6	Document that ParseCurrentRecord modifies parsed	2024-02-15 20:24:31 -08:00
Nathan McRae	a5eedef36b	Add Line field to SaneTsvRecord So extending formats can have line information	2024-02-15 20:24:01 -08:00
Nathan McRae	ee46c93ce1	Change column type implementation Use types instead of an enum so it's more easily extended.	2024-02-15 20:22:43 -08:00
Nathan McRae	83602391ab	Remove Columns from SaneTsv To simplify object manipulation for now. Note that this got rid of uniqueness checking for the current implementation.	2024-02-15 14:22:04 -08:00
Nathan McRae	725a5b2034	'Sane' -> 'Simple' for first format Change the naming so the overall family of formats is Sane TSV, while the simplest format is Simple TSV.	2024-02-15 12:52:27 -08:00
Nathan McRae	38d324738e	Add basic serialization	2024-02-15 11:57:45 -08:00