Compare commits

...

10 Commits

Author SHA1 Message Date
Nathan McRae
0b302734e9 Minor format spec rewording 2024-02-15 20:27:35 -08:00
Nathan McRae
7bc553905d Add roadmap 2024-02-15 20:26:56 -08:00
Nathan McRae
932fbd553a Add ExtraTSV 2024-02-15 20:26:40 -08:00
Nathan McRae
93f2e2ea5b Make comments not end with \n 2024-02-15 20:24:44 -08:00
Nathan McRae
99766f99a6 Document that ParseCurrentRecord modifies parsed 2024-02-15 20:24:31 -08:00
Nathan McRae
a5eedef36b Add Line field to SaneTsvRecord
So extending formats can have line information
2024-02-15 20:24:01 -08:00
Nathan McRae
ee46c93ce1 Change column type implementation
Use types instead of an enum so it's more easily extended.
2024-02-15 20:22:43 -08:00
Nathan McRae
83602391ab Remove Columns from SaneTsv
To simplify object manipulation for now. Note that this got rid of uniqueness checking
for the current implementation.
2024-02-15 14:22:04 -08:00
Nathan McRae
725a5b2034 'Sane' -> 'Simple' for first format
Change the naming so the overall family of formats is Sane TSV, while the simplest
format is Simple TSV.
2024-02-15 12:52:27 -08:00
Nathan McRae
38d324738e Add basic serialization 2024-02-15 11:57:45 -08:00
11 changed files with 507 additions and 131 deletions

View File

@ -0,0 +1,125 @@

using System.Globalization;
using System.Text.RegularExpressions;
namespace NathanMcRae;
/// <summary>
/// Parser for Extra TSV, a format layered on top of Commented TSV. The file
/// comment must carry an '# ExtraTSV Vx.y.z' version line, record comments are
/// rejected, and columns whose name ends in ':iso8601' (with base type string)
/// are converted to <see cref="DateTime"/> values.
/// </summary>
public class ExtraTsv : SaneTsv
{
    /// <summary>Marks a column whose fields are ISO 8601 timestamps.</summary>
    public class Iso8601Type : ColumnType { }

    /// <summary>Marks a floating-point column that carries a physical unit.</summary>
    public class PhysicalUnitsType : ColumnType
    {
        /// <summary>The unit symbol this column type was created with.</summary>
        public string Units { get; }

        /// <param name="Units">Unit symbol to store in <see cref="Units"/>.</param>
        public PhysicalUnitsType(string Units)
        {
            // Store the argument; without this the property would stay null.
            this.Units = Units;
        }
    }

    /// <summary>Unit symbols accepted in a ':&lt;unit&gt;:units' column suffix.</summary>
    public static readonly string[] ValidUnits =
    {
        "m",
        "s",
        "A",
        "K",
        "cd",
        "mol",
        "kg",
        "Hz",
        "rad",
        "sr",
        "N",
        "Pa",
        "J",
        "W",
        "C",
        "V",
        "F",
        "Ω",
        "S",
        "Wb",
        "T",
        "H",
        "°C",
        "lm",
        "lx",
        "Bq",
        "Gy",
        "Sv",
        "kat"
    };

    public static readonly int MajorVersion = 0;
    public static readonly int MinorVersion = 0;
    public static readonly int PatchVersion = 1;

    /// <summary>
    /// Matches the version line at the start of the file comment. The leading
    /// space is what follows the '#', which is not part of the comment text.
    /// </summary>
    public static Regex VersionRegex = new Regex(@"^ ExtraTSV V(\d+)\.(\d+)\.(\d+)");

    /// <summary>
    /// Parses <paramref name="inputBuffer"/> as Extra TSV.
    /// </summary>
    /// <param name="inputBuffer">Raw bytes of the file.</param>
    /// <returns>
    /// A new <see cref="ExtraTsv"/> holding the parsed column names, column
    /// types, records and file comment.
    /// </returns>
    /// <exception cref="Exception">
    /// The version line is missing or malformed, the major version differs from
    /// <see cref="MajorVersion"/>, a ':units' column has a missing or unknown
    /// unit, a record has a comment, or a timestamp field does not match
    /// 'yyyy-MM-ddTHH:mm:ss.ffff'.
    /// </exception>
    public static ExtraTsv ParseExtraTsv(byte[] inputBuffer)
    {
        SaneTsv tsv = ParseCommentedTsv(inputBuffer);

        string versionError = $"ExtraTSV expects the file to start with '# ExtraTSV Vx.y.z' where x.y.z is a version compatible with {MajorVersion}.{MinorVersion}.{PatchVersion}";
        if (tsv.FileComment == null)
        {
            throw new Exception(versionError);
        }

        Match match = VersionRegex.Match(tsv.FileComment);
        if (!match.Success)
        {
            throw new Exception(versionError);
        }

        // TryParse so that an overlong or non-ASCII digit run is reported as a
        // version error instead of leaking OverflowException/FormatException.
        if (!int.TryParse(match.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int fileMajorVersion))
        {
            throw new Exception(versionError);
        }

        if (fileMajorVersion != MajorVersion)
        {
            throw new Exception($"File has major version ({fileMajorVersion}) which is incompatible with this parser's major version {MajorVersion}");
        }

        for (int i = 0; i < tsv.ColumnNames.Length; i++)
        {
            string fullName = tsv.ColumnNames[i];
            string[] typeParts = fullName.Split(":");
            string lastPart = typeParts[typeParts.Length - 1];

            // Requiring more than one part keeps a column literally named
            // "iso8601" as a plain string column; stripping ":iso8601" from it
            // would ask Substring for a negative length.
            if (lastPart == "iso8601" && typeParts.Length > 1 && tsv.ColumnTypes[i] == typeof(StringType))
            {
                tsv.ColumnNames[i] = fullName.Substring(0, fullName.Length - ":iso8601".Length);
                tsv.ColumnTypes[i] = typeof(Iso8601Type);
            }
            // TODO: ISO8601 time spans
            // TODO: ISO8601 time durations
            else if (lastPart == "units" && (tsv.ColumnTypes[i] == typeof(Float64Type) || tsv.ColumnTypes[i] == typeof(Float32Type)))
            {
                if (typeParts.Length < 2)
                {
                    // There is no part before "units" to read a unit from.
                    throw new Exception($"Missing units type for column {i}");
                }

                string units = typeParts[typeParts.Length - 2];
                if (!ValidUnits.Contains(units))
                {
                    throw new Exception($"Invalid units type '{units}' for column {i}");
                }

                // TODO: How to store type information since the ColumnTypes is of type Type?
            }
        }

        // The base parser returns a plain SaneTsv, which cannot be downcast to
        // ExtraTsv, so the result is built as a new instance instead.
        var result = new ExtraTsv();
        var records = new List<SaneTsvRecord>(tsv.Records.Count);

        CultureInfo provider = CultureInfo.InvariantCulture;
        foreach (SaneTsvRecord record in tsv.Records)
        {
            if (record.Comment != null)
            {
                throw new Exception($"Line {record.Line} has comment above it which is not allowed");
            }

            for (int j = 0; j < tsv.ColumnTypes.Length; j++)
            {
                if (tsv.ColumnTypes[j] != typeof(Iso8601Type))
                {
                    continue;
                }

                if (!DateTime.TryParseExact((string)record[j], "yyyy-MM-ddTHH:mm:ss.ffff", provider, DateTimeStyles.None, out DateTime parsed))
                {
                    throw new Exception($"ISO 8601 timestamp format error on line {record.Line}, field {j}");
                }

                record.Fields[j] = parsed;
            }

            // Re-create the record so its Parent is the object we return.
            records.Add(new SaneTsvRecord(result, record.Fields, record.Comment, record.Line));
        }

        result.ColumnNames = tsv.ColumnNames;
        result.ColumnTypes = tsv.ColumnTypes;
        result.Records = records;
        result.FileComment = tsv.FileComment;
        return result;
    }
}

View File

@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Project for the Extra TSV code: targets net6.0 and uses the NathanMcRae root namespace. -->
<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<RootNamespace>NathanMcRae</RootNamespace>
</PropertyGroup>
<ItemGroup>
<!-- References the SaneTsv project one directory up. -->
<ProjectReference Include="..\SaneTsv.csproj" />
</ItemGroup>
</Project>

View File

@ -0,0 +1,44 @@
Extra TSV adds many convenience types to Sane TSV:
- Timestamps
Just this format for now: yyyy-MM-ddTHH:mm:ss.ffff
- Timespans
- Time durations
- Multiformats
- Multihashes
- Multiprotocols
- ...
- Physical units
To start with, just use SI base and derived units
- Base units
- m
- s
- A
- K
- cd
- mol
- kg
- Derived units
- Hz
- rad
- sr
- N
- Pa
- J
- W
- C
- V
- F
- Ω
- S
- Wb
- T
- H
- °C
- lm
- lx
- Bq
- Gy
- Sv
- kat
How to handle derived units?

View File

@ -0,0 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Builds an executable (OutputType Exe) targeting net6.0. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<!-- References the ExtraTsv project in the sibling ExtraTsv directory. -->
<ProjectReference Include="..\ExtraTsv\ExtraTsv.csproj" />
</ItemGroup>
</Project>

View File

@ -0,0 +1,20 @@
using NathanMcRae;
using System.Text;
{
    // Parses a small ExtraTSV document (version comment, boolean / binary /
    // iso8601 columns, two records) and reports whether the first record's
    // boolean column came back as true.
    string label = "Bool test";
    string input =
        "# ExtraTSV V0.0.1\n" +
        "column1:ty\\#pe:boolean\tcolumn2:binary\tcolumnthree\\nyep:iso8601:string" +
        "\nTRUE\tvalue\\\\t\0woo\t2024-02-15T18:03:30.0000" +
        "\nFALSE\tnother\t2024-02-15T18:03:39.0001";

    byte[] inputBytes = Encoding.UTF8.GetBytes(input);
    ExtraTsv table = ExtraTsv.ParseExtraTsv(inputBytes);

    // The escaped '#' in the header is unescaped in the parsed column name.
    object firstFlag = table.Records[0]["column1:ty#pe"];
    Console.WriteLine(firstFlag is true ? $"Passed {label}" : $"Failed {label}");
}

View File

@ -7,36 +7,34 @@ namespace NathanMcRae;
/// </summary> /// </summary>
public class SaneTsv public class SaneTsv
{ {
public enum ColumnType // Like an enum, but more extensible
{ public class ColumnType { }
STRING, public class StringType : ColumnType { }
BOOLEAN, public class BooleanType : ColumnType { }
FLOAT32, public class Float32Type : ColumnType { }
FLOAT64, public class Float64Type : ColumnType { }
UINT32, public class UInt32Type : ColumnType { }
UINT64, public class UInt64Type : ColumnType { }
INT32, public class Int32Type : ColumnType { }
INT64, public class Int64Type : ColumnType { }
BINARY, public class BinaryType : ColumnType { }
}
protected enum FormatType protected enum FormatType
{ {
SANE_TSV = 0, SIMPLE_TSV = 0,
TYPED_TSV = 1, TYPED_TSV = 1,
COMMENTED_TSV = 2, COMMENTED_TSV = 2,
} }
// TODO: We need to be able to update all these in tandem somehow // TODO: We need to be able to update all these in tandem somehow
public string[] ColumnNames { get; protected set; } public string[] ColumnNames { get; protected set; }
public ColumnType[] ColumnTypes { get; protected set; } public Type[] ColumnTypes { get; protected set; }
public Dictionary<string, List<object>> Columns { get; protected set; }
public List<SaneTsvRecord> Records { get; protected set; } public List<SaneTsvRecord> Records { get; protected set; }
public string FileComment { get; protected set; } = null; public string FileComment { get; protected set; } = null;
public static SaneTsv ParseSaneTsv(byte[] inputBuffer) public static SaneTsv ParseSimpleTsv(byte[] inputBuffer)
{ {
return Parse(inputBuffer, FormatType.SANE_TSV); return Parse(inputBuffer, FormatType.SIMPLE_TSV);
} }
public static SaneTsv ParseTypedTsv(byte[] inputBuffer) public static SaneTsv ParseTypedTsv(byte[] inputBuffer)
@ -53,9 +51,8 @@ public class SaneTsv
protected static SaneTsv Parse(byte[] inputBuffer, FormatType format) protected static SaneTsv Parse(byte[] inputBuffer, FormatType format)
{ {
var parsed = new SaneTsv(); var parsed = new SaneTsv();
parsed.Columns = new Dictionary<string, List<object>>();
parsed.ColumnNames = new string[] { }; parsed.ColumnNames = new string[] { };
parsed.ColumnTypes = new ColumnType[] { }; parsed.ColumnTypes = new Type[] { };
parsed.Records = new List<SaneTsvRecord>(); parsed.Records = new List<SaneTsvRecord>();
var fieldBytes = new List<byte>(); var fieldBytes = new List<byte>();
@ -95,7 +92,7 @@ public class SaneTsv
} }
else else
{ {
throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}"); throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
} }
} }
else if (inputBuffer[i] == '\t') else if (inputBuffer[i] == '\t')
@ -116,7 +113,7 @@ public class SaneTsv
numFields = fields.Count; numFields = fields.Count;
parsed.ColumnNames = new string[numFields]; parsed.ColumnNames = new string[numFields];
parsed.ColumnTypes = new ColumnType[numFields]; parsed.ColumnTypes = new Type[numFields];
int numTypesBlank = 0; int numTypesBlank = 0;
@ -135,7 +132,7 @@ public class SaneTsv
string columnTypeString; string columnTypeString;
string columnName; string columnName;
if (columnString.Contains(':')) { if (columnString.Contains(':')) {
if (format == FormatType.SANE_TSV) if (format == FormatType.SIMPLE_TSV)
{ {
throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names"); throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names");
} }
@ -144,7 +141,7 @@ public class SaneTsv
} }
else else
{ {
if (format > FormatType.SANE_TSV) if (format > FormatType.SIMPLE_TSV)
{ {
throw new Exception($"Header {fields.Count} has no type"); throw new Exception($"Header {fields.Count} has no type");
} }
@ -152,53 +149,46 @@ public class SaneTsv
columnName = columnString; columnName = columnString;
} }
ColumnType type; Type type;
switch (columnTypeString) switch (columnTypeString)
{ {
case "": case "":
numTypesBlank++; numTypesBlank++;
type = ColumnType.STRING; type = typeof(StringType);
break; break;
case "string": case "string":
type = ColumnType.STRING; type = typeof(StringType);
break; break;
case "boolean": case "boolean":
type = ColumnType.BOOLEAN; type = typeof(BooleanType);
break; break;
case "float32": case "float32":
type = ColumnType.FLOAT32; type = typeof(Float32Type);
break; break;
case "float64": case "float64":
type = ColumnType.FLOAT64; type = typeof(Float64Type);
break; break;
case "uint32": case "uint32":
type = ColumnType.UINT32; type = typeof(UInt32Type);
break; break;
case "uint64": case "uint64":
type = ColumnType.UINT64; type = typeof(UInt64Type);
break; break;
case "int32": case "int32":
type = ColumnType.INT32; type = typeof(Int32Type);
break; break;
case "int64": case "int64":
type = ColumnType.INT64; type = typeof(Int64Type);
break; break;
case "binary": case "binary":
type = ColumnType.BINARY; type = typeof(BinaryType);
break; break;
default: default:
throw new Exception($"Invalid type '{columnTypeString}' for column {j}"); throw new Exception($"Invalid type '{columnTypeString}' for column {j}");
} }
try // TODO: Check column name uniqueness
{
parsed.Columns.Add(columnName, new List<object>());
}
catch (Exception e)
{
throw new Exception($"Column name {columnName} is not unique", e);
}
parsed.ColumnNames[j] = columnName; parsed.ColumnNames[j] = columnName;
parsed.ColumnTypes[j] = type; parsed.ColumnTypes[j] = type;
@ -224,7 +214,7 @@ public class SaneTsv
comment = currentComment.ToString(); comment = currentComment.ToString();
currentComment.Clear(); currentComment.Clear();
} }
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment)); parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line));
fields.Clear(); fields.Clear();
} }
@ -241,8 +231,11 @@ public class SaneTsv
{ {
var commentBytes = new byte[j - i - 1]; var commentBytes = new byte[j - i - 1];
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1); Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
if (currentComment.Length > 0)
{
currentComment.Append('\n');
}
currentComment.Append(Encoding.UTF8.GetString(commentBytes)); currentComment.Append(Encoding.UTF8.GetString(commentBytes));
currentComment.Append("\n");
i = j; i = j;
currentLineStart = i + 1; currentLineStart = i + 1;
line++; line++;
@ -281,23 +274,25 @@ public class SaneTsv
comment = currentComment.ToString(); comment = currentComment.ToString();
currentComment.Clear(); currentComment.Clear();
} }
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment)); parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line));
fields.Clear(); fields.Clear();
} }
return parsed; return parsed;
} }
/// <summary>
/// Note: this modifies 'parsed'
/// </summary>
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line) protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
{ {
var parsedFields = new object[fields.Count]; var parsedFields = new object[fields.Count];
for (int j = 0; j < fields.Count; j++) for (int j = 0; j < fields.Count; j++)
{ {
// All other types require the content to be UTF-8. Binary fields can ignore that. // All other types require the content to be UTF-8. Binary fields can ignore that.
if (parsed.ColumnTypes[j] == ColumnType.BINARY) if (parsed.ColumnTypes[j] == typeof(BinaryType))
{ {
parsedFields[j] = fields[j]; parsedFields[j] = fields[j];
parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
continue; continue;
} }
@ -311,13 +306,14 @@ public class SaneTsv
throw new Exception($"Field {j} on line {line} is not valid UTF-8", e); throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
} }
switch (parsed.ColumnTypes[j]) // TODO: Add checking for numeric types format
if (parsed.ColumnTypes[j] == typeof(StringType))
{ {
case ColumnType.STRING:
parsedFields[j] = fieldString; parsedFields[j] = fieldString;
parsed.Columns[parsed.ColumnNames[j]].Add(fieldString); }
break; else if (parsed.ColumnTypes[j] == typeof(BooleanType))
case ColumnType.BOOLEAN: {
bool parsedBool; bool parsedBool;
if (fieldString == "TRUE") if (fieldString == "TRUE")
{ {
@ -333,65 +329,63 @@ public class SaneTsv
} }
parsedFields[j] = parsedBool; parsedFields[j] = parsedBool;
parsed.Columns[parsed.ColumnNames[j]].Add(parsedBool); }
break; else if (parsed.ColumnTypes[j] == typeof(Float32Type))
case ColumnType.FLOAT32: {
if (!float.TryParse(fieldString, out float parsedFloat)) if (!float.TryParse(fieldString, out float parsedFloat))
{ {
throw new Exception($"Field {j} on line {line} is not valid single-precision float"); throw new Exception($"Field {j} on line {line} is not valid single-precision float");
} }
parsedFields[j] = parsedFloat; parsedFields[j] = parsedFloat;
parsed.Columns[parsed.ColumnNames[j]].Add(parsedFloat); }
break; else if (parsed.ColumnTypes[j] == typeof(Float64Type))
case ColumnType.FLOAT64: {
if (!double.TryParse(fieldString, out double parsedDouble)) if (!double.TryParse(fieldString, out double parsedDouble))
{ {
throw new Exception($"Field {j} on line {line} is not valid double-precision float"); throw new Exception($"Field {j} on line {line} is not valid double-precision float");
} }
parsedFields[j] = parsedDouble; parsedFields[j] = parsedDouble;
parsed.Columns[parsed.ColumnNames[j]].Add(parsedDouble); }
break; else if (parsed.ColumnTypes[j] == typeof(UInt32Type))
case ColumnType.UINT32: {
if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32)) if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
{ {
throw new Exception($"Field {j} on line {line} is not valid UInt32"); throw new Exception($"Field {j} on line {line} is not valid UInt32");
} }
parsedFields[j] = parsedUInt32; parsedFields[j] = parsedUInt32;
parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt32); }
break; else if (parsed.ColumnTypes[j] == typeof(UInt64Type))
case ColumnType.UINT64: {
if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64)) if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64))
{ {
throw new Exception($"Field {j} on line {line} is not valid UInt64"); throw new Exception($"Field {j} on line {line} is not valid UInt64");
} }
parsedFields[j] = parsedUInt64; parsedFields[j] = parsedUInt64;
parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt64); }
break; else if (parsed.ColumnTypes[j] == typeof(Int32Type))
case ColumnType.INT32: {
if (!Int32.TryParse(fieldString, out Int32 parsedInt32)) if (!Int32.TryParse(fieldString, out Int32 parsedInt32))
{ {
throw new Exception($"Field {j} on line {line} is not valid Int32"); throw new Exception($"Field {j} on line {line} is not valid Int32");
} }
parsedFields[j] = parsedInt32; parsedFields[j] = parsedInt32;
parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt32); }
break; else if (parsed.ColumnTypes[j] == typeof(Int64Type))
case ColumnType.INT64: {
if (!Int64.TryParse(fieldString, out Int64 parsedInt64)) if (!Int64.TryParse(fieldString, out Int64 parsedInt64))
{ {
throw new Exception($"Field {j} on line {line} is not valid Int64"); throw new Exception($"Field {j} on line {line} is not valid Int64");
} }
parsedFields[j] = parsedInt64; parsedFields[j] = parsedInt64;
parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt64); }
break; else
case ColumnType.BINARY: {
throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
default:
throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}"); throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
} }
} }
@ -399,6 +393,103 @@ public class SaneTsv
return parsedFields; return parsedFields;
} }
/// <summary>
/// Serializes a header and rows of string fields as Simple TSV and returns the
/// UTF-8 encoded bytes. Fields are separated by '\t' and lines by '\n'; the
/// output never ends with '\n', including when <paramref name="data"/> is empty.
/// </summary>
/// <param name="header">Column names; must be unique and must not contain ':'.</param>
/// <param name="data">Rows of fields; every row must have as many fields as the header.</param>
/// <returns>The serialized table as UTF-8 bytes.</returns>
/// <exception cref="Exception">
/// A column name contains ':', column names are not unique, or a row's field
/// count differs from the header's.
/// </exception>
public static byte[] SerializeSimpleTsv(IList<string> header, IList<IList<string>> data)
{
    var escapedString = new StringBuilder();

    // Serialize header
    for (int i = 0; i < header.Count; i++)
    {
        if (header[i].Contains(':'))
        {
            throw new Exception($"Column {i} contains the character ':'");
        }

        for (int j = i + 1; j < header.Count; j++)
        {
            if (header[i] == header[j])
            {
                throw new Exception("Column names in header must be unique");
            }
        }

        if (i > 0)
        {
            escapedString.Append('\t');
        }
        AppendEscapedField(escapedString, header[i]);
    }

    // Serialize data
    for (int i = 0; i < data.Count; i++)
    {
        if (data[i].Count != header.Count)
        {
            throw new Exception($"Row {i} has {data[i].Count} fields but the header has {header.Count}");
        }

        // The line break goes before each record rather than after the header,
        // so a table with no records does not end with a trailing '\n'.
        escapedString.Append('\n');

        for (int j = 0; j < data[i].Count; j++)
        {
            if (j > 0)
            {
                escapedString.Append('\t');
            }
            AppendEscapedField(escapedString, data[i][j]);
        }
    }

    return Encoding.UTF8.GetBytes(escapedString.ToString());
}

/// <summary>
/// Appends <paramref name="field"/> to <paramref name="builder"/>, writing
/// '\n', '\t', '\\' and '#' as the two-character escapes \n, \t, \\ and \#.
/// </summary>
private static void AppendEscapedField(StringBuilder builder, string field)
{
    foreach (char c in field)
    {
        switch (c)
        {
            case '\n':
                builder.Append("\\n");
                break;
            case '\t':
                builder.Append("\\t");
                break;
            case '\\':
                builder.Append("\\\\");
                break;
            case '#':
                builder.Append("\\#");
                break;
            default:
                builder.Append(c);
                break;
        }
    }
}
public SaneTsvRecord this[int i] => Records[i]; public SaneTsvRecord this[int i] => Records[i];
public class SaneTsvRecord public class SaneTsvRecord
@ -406,14 +497,18 @@ public class SaneTsv
public SaneTsv Parent { get; } public SaneTsv Parent { get; }
public string Comment { get; } public string Comment { get; }
public object[] Fields { get; } public object[] Fields { get; }
public int Line { get; }
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)]; public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment) public object this[int columnIndex] => Fields[columnIndex];
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment, int line)
{ {
Parent = parent; Parent = parent;
Fields = fields; Fields = fields;
Comment = comment; Comment = comment;
Line = line;
} }
} }
} }

View File

@ -10,8 +10,14 @@
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<Compile Remove="ExtraTsvTest\**" />
<Compile Remove="ExtraTsv\**" />
<Compile Remove="SaneTsvTest\**" /> <Compile Remove="SaneTsvTest\**" />
<EmbeddedResource Remove="ExtraTsvTest\**" />
<EmbeddedResource Remove="ExtraTsv\**" />
<EmbeddedResource Remove="SaneTsvTest\**" /> <EmbeddedResource Remove="SaneTsvTest\**" />
<None Remove="ExtraTsvTest\**" />
<None Remove="ExtraTsv\**" />
<None Remove="SaneTsvTest\**" /> <None Remove="SaneTsvTest\**" />
</ItemGroup> </ItemGroup>

View File

@ -5,7 +5,11 @@ VisualStudioVersion = 17.7.34024.191
MinimumVisualStudioVersion = 10.0.40219.1 MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsv", "SaneTsv.csproj", "{DBC5CE44-361C-4387-B1E2-409C1CAE2B4C}" Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsv", "SaneTsv.csproj", "{DBC5CE44-361C-4387-B1E2-409C1CAE2B4C}"
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}" Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ExtraTsv", "ExtraTsv\ExtraTsv.csproj", "{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ExtraTsvTest", "ExtraTsvTest\ExtraTsvTest.csproj", "{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}"
EndProject EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -21,6 +25,14 @@ Global
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU {43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU {43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.Build.0 = Release|Any CPU {43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.Build.0 = Release|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.Build.0 = Release|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE

View File

@ -39,7 +39,7 @@ using System.Text;
{ {
string testName = "Comment test"; string testName = "Comment test";
string testString1 = "#This is a file comment\n" + string testString1 = "#This is a file comment\n" +
" #One more file comment line\n" + "#One more file comment line\n" +
"column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" + "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\n#This is a comment" + "\n#This is a comment" +
"\n#Another comment line" + "\n#Another comment line" +
@ -49,4 +49,26 @@ using System.Text;
SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1)); SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1));
} }
{
string testName = "Serde test";
string testString1 = "column1\tcolumn2\tcolumnthree\\nyep" +
"\nTRUE\tvalue\\\\twoo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
SaneTsv parsed = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString1));
string serialized = Encoding.UTF8.GetString(SaneTsv.SerializeSimpleTsv(parsed.ColumnNames, parsed.Records.Select(r => r.Fields.Select(f => f.ToString()).ToArray()).ToArray()));
if (testString1 == serialized)
{
Console.WriteLine($"Passed {testName}");
}
else
{
Console.WriteLine($"Failed {testName}");
}
}
Console.WriteLine("Done with tests"); Console.WriteLine("Done with tests");
// TODO: Check qNaN, sNaN, +inf, -inf values for float types

View File

@ -1,6 +1,10 @@
# Sane TSV # Sane TSV
Sane TSV is a strict format for tabular data. Sane Tab-Separate Values is a series of tabular formats as an alternative to the under-specified TSV / CSV quagmire.
# Simple TSV
Simple TSV is a strict format for tabular data.
'\n' (0x0A) character delimit lines, and '\t' (0x09) characters delimit fields within a line. '\n' (0x0A) character delimit lines, and '\t' (0x09) characters delimit fields within a line.
@ -12,7 +16,7 @@ Empty fields (i.e. two subsequent '\t' characters) are allowed.
The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)). The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)).
All lines in the file must have the same number of fields. All lines in the file must have the same number of fields as are in the header.
The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error. The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error.
@ -20,7 +24,7 @@ Implementations of the format do not need to handle file reading and writing dir
# Typed TSV # Typed TSV
Typed TSV builds on Sane TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types: Typed TSV builds on Simple TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
- 'string' - 'string'
- 'boolean' - 'boolean'
@ -34,7 +38,7 @@ Typed TSV builds on Sane TSV to allow for typing of columns. All column names in
Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters. Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters.
All fields in the rest of the file must be of the type corresponding the their column. All fields in the rest of the file must be of the type corresponding to their column.
Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions: Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
@ -49,13 +53,17 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
- 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*` - 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
- 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed) - 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)
Binary fields are left as-is (after unescaping is performed).
Typed TSV files should have the .ytsv extension (.ttsv is already used). Typed TSV files should have the .ytsv extension (.ttsv is already used).
# Commented TSV # Commented TSV
Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. They are kept distinct so that some applications of it can take advantage of the extra flexibility, while others can stick with the more restricted Typed TSV format. Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. The formats are kept distinct so that some applications can take advantage of the extra flexibility comments allow, while others can stick with the more restricted Typed TSV format.
Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Any unescaped '#' after the start of a line are errors. Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Note that the '#' character is excluded from the comment data.
Multiple consecutive comment lines are considered a single comment, with each line separated by a '\n'.
Comments must be UTF-8 encoded text. Comments must be UTF-8 encoded text.
@ -78,7 +86,7 @@ Note that extended formats must remain parseable by baseline parsers, hence we m
Extending formats may also have restrictions. For example, they could disallow record comments and only allow the file comment above the header. Extending formats may also have restrictions. For example, they could disallow record comments and only allow the file comment above the header.
Extended formats may still use the .ctsv extension, though they could use a dedicated one as well. Extended formats may still use the .ctsv extension, though they could use a dedicated one instead.
## Ideas for Extension ## Ideas for Extension

16
readme.md Normal file
View File

@ -0,0 +1,16 @@
# Sane TSV
## Roadmap
- Improve error reporting by including line/column information in exceptions
- Come up with a static-typing interface
Something that doesn't require an array of objects
- Check numeric formatting matches spec
- Do parallel parsing / serializing implementation
- Next task: Refactor parsing so that it will start and end at arbitrary indices and return an array of SaneTsvRecords. The refactor should ignore the current record (unless at the start of the buffer) and continue parsing the record the end index is in.
- More optimization and making parsing modular:
- Have callbacks for header parsing and field parsing
- That way other formats (like ExtraTSV) don't have to iterate through the entire set of data again.
- Finish ExtraTSV implementation
- Do Zig implementation
- Make a C interface from that