Compare commits

..

No commits in common. "0b302734e9bbeb6f54b90a4d2f31ca5593d29a3c" and "cc8a122b57a46dcf4ec66072f7a62932e9e0f7e5" have entirely different histories.

11 changed files with 132 additions and 508 deletions

View File

@ -1,125 +0,0 @@

using System.Globalization;
using System.Text.RegularExpressions;
namespace NathanMcRae;
/// <summary>
/// Parser for ExtraTSV, a format layered on top of Commented TSV. The file comment
/// must carry an "ExtraTSV Vx.y.z" version marker, and column names may carry an
/// extra annotation (":iso8601", or ":&lt;unit&gt;:units") in front of the base type.
/// </summary>
public class ExtraTsv : SaneTsv
{
    /// <summary>Column type marker for string columns annotated with ":iso8601".</summary>
    public class Iso8601Type : ColumnType { }

    /// <summary>Column type marker that carries a physical unit symbol.</summary>
    public class PhysicalUnitsType : ColumnType
    {
        /// <summary>The unit symbol this column type was created with.</summary>
        public string Units { get; }

        /// <param name="Units">Unit symbol to store on the type.</param>
        public PhysicalUnitsType(string Units)
        {
            // The parameter shadows the property, so 'this.' is required; without the
            // assignment the property would silently stay null.
            this.Units = Units;
        }
    }

    /// <summary>Unit symbols accepted in a ":&lt;unit&gt;:units" annotation (SI base and derived units).</summary>
    public static readonly string[] ValidUnits =
    {
        "m",
        "s",
        "A",
        "K",
        "cd",
        "mol",
        "kg",
        "Hz",
        "rad",
        "sr",
        "N",
        "Pa",
        "J",
        "W",
        "C",
        "V",
        "F",
        "Ω",
        "S",
        "Wb",
        "T",
        "H",
        "°C",
        "lm",
        "lx",
        "Bq",
        "Gy",
        "Sv",
        "kat"
    };

    public static readonly int MajorVersion = 0;
    public static readonly int MinorVersion = 0;
    public static readonly int PatchVersion = 1;

    /// <summary>
    /// Matches the version marker at the start of the file comment. The pattern begins
    /// with a space, so the comment text is expected to start with " ExtraTSV V".
    /// </summary>
    public static Regex VersionRegex = new Regex(@"^ ExtraTSV V(\d+)\.(\d+)\.(\d+)");

    /// <summary>
    /// Parses <paramref name="inputBuffer"/> as Commented TSV and then applies the
    /// ExtraTSV rules: version check, column annotation handling, rejection of record
    /// comments, and conversion of ISO 8601 timestamp fields to <see cref="DateTime"/>.
    /// </summary>
    /// <param name="inputBuffer">Raw bytes of the ExtraTSV file.</param>
    /// <returns>An <see cref="ExtraTsv"/> holding the parsed columns and records.</returns>
    /// <exception cref="Exception">
    /// The version marker is missing or has a different major version, a units annotation
    /// names an unknown unit, a record has a comment, or a timestamp field is malformed.
    /// </exception>
    public static ExtraTsv ParseExtraTsv(byte[] inputBuffer)
    {
        SaneTsv tsv = ParseCommentedTsv(inputBuffer);

        if (tsv.FileComment is null)
        {
            throw new Exception($"ExtraTSV expects the file to start with '# ExtraTSV Vx.y.z' where x.y.z is a version compatible with {MajorVersion}.{MinorVersion}.{PatchVersion}");
        }

        Match match = VersionRegex.Match(tsv.FileComment);
        if (!match.Success)
        {
            throw new Exception($"ExtraTSV expects the file to start with '# ExtraTSV Vx.y.z' where x.y.z is a version compatible with {MajorVersion}.{MinorVersion}.{PatchVersion}");
        }

        int fileMajorVersion = int.Parse(match.Groups[1].Value);
        if (fileMajorVersion != MajorVersion)
        {
            throw new Exception($"File has major version ({fileMajorVersion}) which is newer than this parser's version {MajorVersion}");
        }

        for (int i = 0; i < tsv.ColumnNames.Length; i++)
        {
            string[] typeParts = tsv.ColumnNames[i].Split(":");

            // A name with no ':' cannot carry an annotation. Treat a column literally
            // named "iso8601" or "units" as an ordinary column instead of indexing
            // before the start of typeParts / passing a negative length to Substring.
            if (typeParts.Length < 2)
            {
                continue;
            }

            string annotation = typeParts[^1];
            Type columnType = tsv.ColumnTypes[i];

            if (annotation == "iso8601" && columnType == typeof(StringType))
            {
                // Strip the trailing ":iso8601" so the column keeps only its own name.
                string columnName = tsv.ColumnNames[i].Substring(0, tsv.ColumnNames[i].Length - ":iso8601".Length);
                tsv.ColumnNames[i] = columnName;
                tsv.ColumnTypes[i] = typeof(Iso8601Type);
            }
            // TODO: ISO8601 time spans
            // TODO: ISO8601 time durations
            else if (annotation == "units" && (columnType == typeof(Float64Type) || columnType == typeof(Float32Type)))
            {
                string unit = typeParts[^2];
                if (Array.IndexOf(ValidUnits, unit) < 0)
                {
                    throw new Exception($"Invalid units type '{unit}' for column {i}");
                }

                // TODO: How to store type information since the ColumnTypes is of type Type?
            }
        }

        CultureInfo provider = CultureInfo.InvariantCulture;
        for (int i = 0; i < tsv.Records.Count; i++)
        {
            SaneTsvRecord record = tsv.Records[i];
            if (record.Comment != null)
            {
                throw new Exception($"Line {record.Line} has comment above it which is not allowed");
            }

            for (int j = 0; j < tsv.ColumnNames.Length; j++)
            {
                if (tsv.ColumnTypes[j] != typeof(Iso8601Type))
                {
                    continue;
                }

                if (!DateTime.TryParseExact((string)record.Fields[j], "yyyy-MM-ddTHH:mm:ss.ffff", provider, DateTimeStyles.None, out DateTime parsed))
                {
                    throw new Exception($"ISO 8601 timestamp format error on line {record.Line}, field {j}");
                }

                // Replace the raw string with the parsed timestamp in place.
                record.Fields[j] = parsed;
            }
        }

        // The object returned by the base parser is not an ExtraTsv (the visible
        // SaneTsv.Parse creates it with 'new SaneTsv()'), so a downcast would throw
        // InvalidCastException. Build a real ExtraTsv that shares the parsed data.
        // NOTE(review): each record's Parent still refers to the intermediate SaneTsv;
        // it shares the same ColumnNames array, so name lookups keep working.
        var result = new ExtraTsv();
        result.ColumnNames = tsv.ColumnNames;
        result.ColumnTypes = tsv.ColumnTypes;
        result.Records = tsv.Records;
        result.FileComment = tsv.FileComment;
        return result;
    }
}

View File

@ -1,14 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- SDK-style project: targets .NET 6 with implicit usings and nullable reference types enabled. -->
<!-- No OutputType is set here, so the SDK default applies. -->
<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- Default namespace for this project; matches the NathanMcRae namespace used by the sources. -->
<RootNamespace>NathanMcRae</RootNamespace>
</PropertyGroup>
<ItemGroup>
<!-- Depends on the SaneTsv project located one directory up. -->
<ProjectReference Include="..\SaneTsv.csproj" />
</ItemGroup>
</Project>

View File

@ -1,44 +0,0 @@
Extra TSV adds many convenience types to Sane TSV:
- Timestamps
Just this format for now: yyyy-MM-ddTHH:mm:ss.ffff
- Timespans
- Time durations
- Multiformats
- Multihashes
- Multiprotocols
- ...
- Physical units
To start with, just use SI base and derived units
- Base units
- m
- s
- A
- K
- cd
- mol
- kg
- Derived units
- Hz
- rad
- sr
- N
- Pa
- J
- W
- C
- V
- F
- Ω
- S
- Wb
- T
- H
- °C
- lm
- lx
- Bq
- Gy
- Sv
- kat
How to handle derived units?

View File

@ -1,14 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- SDK-style project that builds an executable targeting .NET 6. -->
<PropertyGroup>
<!-- Exe: produces a runnable program rather than a library. -->
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<!-- Depends on the ExtraTsv project in the sibling ExtraTsv directory. -->
<ProjectReference Include="..\ExtraTsv\ExtraTsv.csproj" />
</ItemGroup>
</Project>

View File

@ -1,20 +0,0 @@
using NathanMcRae;
using System.Text;
{
    // Parses a small ExtraTSV document (boolean, binary and iso8601 columns) and
    // reports whether the first record's boolean column came back as true.
    string label = "Bool test";
    string input = "# ExtraTSV V0.0.1\n" +
        "column1:ty\\#pe:boolean\tcolumn2:binary\tcolumnthree\\nyep:iso8601:string" +
        "\nTRUE\tvalue\\\\t\0woo\t2024-02-15T18:03:30.0000" +
        "\nFALSE\tnother\t2024-02-15T18:03:39.0001";

    byte[] encoded = Encoding.UTF8.GetBytes(input);
    ExtraTsv table = ExtraTsv.ParseExtraTsv(encoded);

    // The header escapes '#', so the unescaped column name is "column1:ty#pe".
    object firstCell = table.Records[0]["column1:ty#pe"];
    bool passed = firstCell is true;

    Console.WriteLine(passed ? $"Passed {label}" : $"Failed {label}");
}

View File

@ -7,34 +7,36 @@ namespace NathanMcRae;
/// </summary> /// </summary>
public class SaneTsv public class SaneTsv
{ {
// Like an enum, but more extensible public enum ColumnType
public class ColumnType { } {
public class StringType : ColumnType { } STRING,
public class BooleanType : ColumnType { } BOOLEAN,
public class Float32Type : ColumnType { } FLOAT32,
public class Float64Type : ColumnType { } FLOAT64,
public class UInt32Type : ColumnType { } UINT32,
public class UInt64Type : ColumnType { } UINT64,
public class Int32Type : ColumnType { } INT32,
public class Int64Type : ColumnType { } INT64,
public class BinaryType : ColumnType { } BINARY,
}
protected enum FormatType protected enum FormatType
{ {
SIMPLE_TSV = 0, SANE_TSV = 0,
TYPED_TSV = 1, TYPED_TSV = 1,
COMMENTED_TSV = 2, COMMENTED_TSV = 2,
} }
// TODO: We need to be able to update all these in tandem somehow // TODO: We need to be able to update all these in tandem somehow
public string[] ColumnNames { get; protected set; } public string[] ColumnNames { get; protected set; }
public Type[] ColumnTypes { get; protected set; } public ColumnType[] ColumnTypes { get; protected set; }
public Dictionary<string, List<object>> Columns { get; protected set; }
public List<SaneTsvRecord> Records { get; protected set; } public List<SaneTsvRecord> Records { get; protected set; }
public string FileComment { get; protected set; } = null; public string FileComment { get; protected set; } = null;
public static SaneTsv ParseSimpleTsv(byte[] inputBuffer) public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
{ {
return Parse(inputBuffer, FormatType.SIMPLE_TSV); return Parse(inputBuffer, FormatType.SANE_TSV);
} }
public static SaneTsv ParseTypedTsv(byte[] inputBuffer) public static SaneTsv ParseTypedTsv(byte[] inputBuffer)
@ -51,8 +53,9 @@ public class SaneTsv
protected static SaneTsv Parse(byte[] inputBuffer, FormatType format) protected static SaneTsv Parse(byte[] inputBuffer, FormatType format)
{ {
var parsed = new SaneTsv(); var parsed = new SaneTsv();
parsed.Columns = new Dictionary<string, List<object>>();
parsed.ColumnNames = new string[] { }; parsed.ColumnNames = new string[] { };
parsed.ColumnTypes = new Type[] { }; parsed.ColumnTypes = new ColumnType[] { };
parsed.Records = new List<SaneTsvRecord>(); parsed.Records = new List<SaneTsvRecord>();
var fieldBytes = new List<byte>(); var fieldBytes = new List<byte>();
@ -92,7 +95,7 @@ public class SaneTsv
} }
else else
{ {
throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}"); throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}");
} }
} }
else if (inputBuffer[i] == '\t') else if (inputBuffer[i] == '\t')
@ -113,7 +116,7 @@ public class SaneTsv
numFields = fields.Count; numFields = fields.Count;
parsed.ColumnNames = new string[numFields]; parsed.ColumnNames = new string[numFields];
parsed.ColumnTypes = new Type[numFields]; parsed.ColumnTypes = new ColumnType[numFields];
int numTypesBlank = 0; int numTypesBlank = 0;
@ -132,7 +135,7 @@ public class SaneTsv
string columnTypeString; string columnTypeString;
string columnName; string columnName;
if (columnString.Contains(':')) { if (columnString.Contains(':')) {
if (format == FormatType.SIMPLE_TSV) if (format == FormatType.SANE_TSV)
{ {
throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names"); throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names");
} }
@ -141,7 +144,7 @@ public class SaneTsv
} }
else else
{ {
if (format > FormatType.SIMPLE_TSV) if (format > FormatType.SANE_TSV)
{ {
throw new Exception($"Header {fields.Count} has no type"); throw new Exception($"Header {fields.Count} has no type");
} }
@ -149,46 +152,53 @@ public class SaneTsv
columnName = columnString; columnName = columnString;
} }
Type type; ColumnType type;
switch (columnTypeString) switch (columnTypeString)
{ {
case "": case "":
numTypesBlank++; numTypesBlank++;
type = typeof(StringType); type = ColumnType.STRING;
break; break;
case "string": case "string":
type = typeof(StringType); type = ColumnType.STRING;
break; break;
case "boolean": case "boolean":
type = typeof(BooleanType); type = ColumnType.BOOLEAN;
break; break;
case "float32": case "float32":
type = typeof(Float32Type); type = ColumnType.FLOAT32;
break; break;
case "float64": case "float64":
type = typeof(Float64Type); type = ColumnType.FLOAT64;
break; break;
case "uint32": case "uint32":
type = typeof(UInt32Type); type = ColumnType.UINT32;
break; break;
case "uint64": case "uint64":
type = typeof(UInt64Type); type = ColumnType.UINT64;
break; break;
case "int32": case "int32":
type = typeof(Int32Type); type = ColumnType.INT32;
break; break;
case "int64": case "int64":
type = typeof(Int64Type); type = ColumnType.INT64;
break; break;
case "binary": case "binary":
type = typeof(BinaryType); type = ColumnType.BINARY;
break; break;
default: default:
throw new Exception($"Invalid type '{columnTypeString}' for column {j}"); throw new Exception($"Invalid type '{columnTypeString}' for column {j}");
} }
// TODO: Check column name uniqueness try
{
parsed.Columns.Add(columnName, new List<object>());
}
catch (Exception e)
{
throw new Exception($"Column name {columnName} is not unique", e);
}
parsed.ColumnNames[j] = columnName; parsed.ColumnNames[j] = columnName;
parsed.ColumnTypes[j] = type; parsed.ColumnTypes[j] = type;
@ -214,7 +224,7 @@ public class SaneTsv
comment = currentComment.ToString(); comment = currentComment.ToString();
currentComment.Clear(); currentComment.Clear();
} }
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line)); parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear(); fields.Clear();
} }
@ -231,11 +241,8 @@ public class SaneTsv
{ {
var commentBytes = new byte[j - i - 1]; var commentBytes = new byte[j - i - 1];
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1); Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
if (currentComment.Length > 0)
{
currentComment.Append('\n');
}
currentComment.Append(Encoding.UTF8.GetString(commentBytes)); currentComment.Append(Encoding.UTF8.GetString(commentBytes));
currentComment.Append("\n");
i = j; i = j;
currentLineStart = i + 1; currentLineStart = i + 1;
line++; line++;
@ -274,25 +281,23 @@ public class SaneTsv
comment = currentComment.ToString(); comment = currentComment.ToString();
currentComment.Clear(); currentComment.Clear();
} }
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line)); parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear(); fields.Clear();
} }
return parsed; return parsed;
} }
/// <summary>
/// Note: this modifies 'parsed'
/// </summary>
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line) protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
{ {
var parsedFields = new object[fields.Count]; var parsedFields = new object[fields.Count];
for (int j = 0; j < fields.Count; j++) for (int j = 0; j < fields.Count; j++)
{ {
// All other types require the content to be UTF-8. Binary fields can ignore that. // All other types require the content to be UTF-8. Binary fields can ignore that.
if (parsed.ColumnTypes[j] == typeof(BinaryType)) if (parsed.ColumnTypes[j] == ColumnType.BINARY)
{ {
parsedFields[j] = fields[j]; parsedFields[j] = fields[j];
parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
continue; continue;
} }
@ -306,14 +311,13 @@ public class SaneTsv
throw new Exception($"Field {j} on line {line} is not valid UTF-8", e); throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
} }
// TODO: Add checking for numeric types format switch (parsed.ColumnTypes[j])
if (parsed.ColumnTypes[j] == typeof(StringType))
{ {
case ColumnType.STRING:
parsedFields[j] = fieldString; parsedFields[j] = fieldString;
} parsed.Columns[parsed.ColumnNames[j]].Add(fieldString);
else if (parsed.ColumnTypes[j] == typeof(BooleanType)) break;
{ case ColumnType.BOOLEAN:
bool parsedBool; bool parsedBool;
if (fieldString == "TRUE") if (fieldString == "TRUE")
{ {
@ -329,63 +333,65 @@ public class SaneTsv
} }
parsedFields[j] = parsedBool; parsedFields[j] = parsedBool;
} parsed.Columns[parsed.ColumnNames[j]].Add(parsedBool);
else if (parsed.ColumnTypes[j] == typeof(Float32Type)) break;
{ case ColumnType.FLOAT32:
if (!float.TryParse(fieldString, out float parsedFloat)) if (!float.TryParse(fieldString, out float parsedFloat))
{ {
throw new Exception($"Field {j} on line {line} is not valid single-precision float"); throw new Exception($"Field {j} on line {line} is not valid single-precision float");
} }
parsedFields[j] = parsedFloat; parsedFields[j] = parsedFloat;
} parsed.Columns[parsed.ColumnNames[j]].Add(parsedFloat);
else if (parsed.ColumnTypes[j] == typeof(Float64Type)) break;
{ case ColumnType.FLOAT64:
if (!double.TryParse(fieldString, out double parsedDouble)) if (!double.TryParse(fieldString, out double parsedDouble))
{ {
throw new Exception($"Field {j} on line {line} is not valid double-precision float"); throw new Exception($"Field {j} on line {line} is not valid double-precision float");
} }
parsedFields[j] = parsedDouble; parsedFields[j] = parsedDouble;
} parsed.Columns[parsed.ColumnNames[j]].Add(parsedDouble);
else if (parsed.ColumnTypes[j] == typeof(UInt32Type)) break;
{ case ColumnType.UINT32:
if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32)) if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
{ {
throw new Exception($"Field {j} on line {line} is not valid UInt32"); throw new Exception($"Field {j} on line {line} is not valid UInt32");
} }
parsedFields[j] = parsedUInt32; parsedFields[j] = parsedUInt32;
} parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt32);
else if (parsed.ColumnTypes[j] == typeof(UInt64Type)) break;
{ case ColumnType.UINT64:
if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64)) if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64))
{ {
throw new Exception($"Field {j} on line {line} is not valid UInt64"); throw new Exception($"Field {j} on line {line} is not valid UInt64");
} }
parsedFields[j] = parsedUInt64; parsedFields[j] = parsedUInt64;
} parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt64);
else if (parsed.ColumnTypes[j] == typeof(Int32Type)) break;
{ case ColumnType.INT32:
if (!Int32.TryParse(fieldString, out Int32 parsedInt32)) if (!Int32.TryParse(fieldString, out Int32 parsedInt32))
{ {
throw new Exception($"Field {j} on line {line} is not valid Int32"); throw new Exception($"Field {j} on line {line} is not valid Int32");
} }
parsedFields[j] = parsedInt32; parsedFields[j] = parsedInt32;
} parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt32);
else if (parsed.ColumnTypes[j] == typeof(Int64Type)) break;
{ case ColumnType.INT64:
if (!Int64.TryParse(fieldString, out Int64 parsedInt64)) if (!Int64.TryParse(fieldString, out Int64 parsedInt64))
{ {
throw new Exception($"Field {j} on line {line} is not valid Int64"); throw new Exception($"Field {j} on line {line} is not valid Int64");
} }
parsedFields[j] = parsedInt64; parsedFields[j] = parsedInt64;
} parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt64);
else break;
{ case ColumnType.BINARY:
throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
default:
throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}"); throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
} }
} }
@ -393,103 +399,6 @@ public class SaneTsv
return parsedFields; return parsedFields;
} }
/// <summary>
/// Serializes a header and rows of string fields as Simple TSV, encoded as UTF-8.
/// Fields are separated by '\t' and lines by '\n'; newline, tab, backslash and '#'
/// characters inside a field are written as backslash escapes. The output never
/// ends with '\n', including when <paramref name="data"/> has no rows.
/// </summary>
/// <param name="header">Column names; must be unique and must not contain ':'.</param>
/// <param name="data">Rows of fields; every row must have as many fields as the header.</param>
/// <returns>The UTF-8 bytes of the serialized document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="header"/> or <paramref name="data"/> is null.</exception>
/// <exception cref="Exception">
/// A column name contains ':', column names are not unique, or a row's field count
/// differs from the header's.
/// </exception>
public static byte[] SerializeSimpleTsv(IList<string> header, IList<IList<string>> data)
{
    ArgumentNullException.ThrowIfNull(header);
    ArgumentNullException.ThrowIfNull(data);

    var escapedString = new StringBuilder();

    // Serialize header
    var seenNames = new HashSet<string>();
    for (int i = 0; i < header.Count; i++)
    {
        if (header[i].Contains(':'))
        {
            throw new Exception($"Column {i} contains the character ':'");
        }

        // HashSet.Add returns false for a repeat, replacing the pairwise comparison.
        if (!seenNames.Add(header[i]))
        {
            throw new Exception("Column names in header must be unique");
        }

        if (i > 0)
        {
            escapedString.Append('\t');
        }

        AppendEscapedField(escapedString, header[i]);
    }

    // Serialize data. The '\n' is written in front of each row rather than after the
    // header, so a document without rows does not end in a newline (which readers
    // treat as a trailing empty row and reject).
    for (int i = 0; i < data.Count; i++)
    {
        IList<string> row = data[i];
        if (row.Count != header.Count)
        {
            throw new Exception($"Row {i} has {row.Count} fields but the header has {header.Count}");
        }

        escapedString.Append('\n');
        for (int j = 0; j < row.Count; j++)
        {
            if (j > 0)
            {
                escapedString.Append('\t');
            }

            AppendEscapedField(escapedString, row[j]);
        }
    }

    return Encoding.UTF8.GetBytes(escapedString.ToString());
}

/// <summary>Appends <paramref name="field"/> to <paramref name="output"/>, escaping '\n', '\t', '\\' and '#'.</summary>
private static void AppendEscapedField(StringBuilder output, string field)
{
    foreach (char c in field)
    {
        switch (c)
        {
            case '\n':
                output.Append("\\n");
                break;
            case '\t':
                output.Append("\\t");
                break;
            case '\\':
                output.Append("\\\\");
                break;
            case '#':
                output.Append("\\#");
                break;
            default:
                output.Append(c);
                break;
        }
    }
}
public SaneTsvRecord this[int i] => Records[i]; public SaneTsvRecord this[int i] => Records[i];
public class SaneTsvRecord public class SaneTsvRecord
@ -497,18 +406,14 @@ public class SaneTsv
public SaneTsv Parent { get; } public SaneTsv Parent { get; }
public string Comment { get; } public string Comment { get; }
public object[] Fields { get; } public object[] Fields { get; }
public int Line { get; }
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)]; public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
public object this[int columnIndex] => Fields[columnIndex]; public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment, int line)
{ {
Parent = parent; Parent = parent;
Fields = fields; Fields = fields;
Comment = comment; Comment = comment;
Line = line;
} }
} }
} }

View File

@ -10,14 +10,8 @@
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<Compile Remove="ExtraTsvTest\**" />
<Compile Remove="ExtraTsv\**" />
<Compile Remove="SaneTsvTest\**" /> <Compile Remove="SaneTsvTest\**" />
<EmbeddedResource Remove="ExtraTsvTest\**" />
<EmbeddedResource Remove="ExtraTsv\**" />
<EmbeddedResource Remove="SaneTsvTest\**" /> <EmbeddedResource Remove="SaneTsvTest\**" />
<None Remove="ExtraTsvTest\**" />
<None Remove="ExtraTsv\**" />
<None Remove="SaneTsvTest\**" /> <None Remove="SaneTsvTest\**" />
</ItemGroup> </ItemGroup>

View File

@ -5,11 +5,7 @@ VisualStudioVersion = 17.7.34024.191
MinimumVisualStudioVersion = 10.0.40219.1 MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsv", "SaneTsv.csproj", "{DBC5CE44-361C-4387-B1E2-409C1CAE2B4C}" Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsv", "SaneTsv.csproj", "{DBC5CE44-361C-4387-B1E2-409C1CAE2B4C}"
EndProject EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}" Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ExtraTsv", "ExtraTsv\ExtraTsv.csproj", "{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ExtraTsvTest", "ExtraTsvTest\ExtraTsvTest.csproj", "{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}"
EndProject EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -25,14 +21,6 @@ Global
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU {43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU {43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.Build.0 = Release|Any CPU {43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.Build.0 = Release|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.Build.0 = Release|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE

View File

@ -49,26 +49,4 @@ using System.Text;
SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1)); SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1));
} }
{
    // Round-trip check: parse a Simple TSV document, serialize it again, and
    // compare the result with the original text.
    string label = "Serde test";
    string source = "column1\tcolumn2\tcolumnthree\\nyep" +
        "\nTRUE\tvalue\\\\twoo\tvaluetrhee" +
        "\nFALSE\tnother\tno\\ther";

    SaneTsv table = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(source));

    // Convert every parsed field back to its string form, one array per record.
    var rows = table.Records
        .Select(record => record.Fields.Select(field => field.ToString()).ToArray())
        .ToArray();

    byte[] roundTripped = SaneTsv.SerializeSimpleTsv(table.ColumnNames, rows);
    bool matches = source == Encoding.UTF8.GetString(roundTripped);

    Console.WriteLine(matches ? $"Passed {label}" : $"Failed {label}");
}
Console.WriteLine("Done with tests"); Console.WriteLine("Done with tests");
// TODO: Check qNaN, sNaN, +inf, -inf values for float types

View File

@ -1,10 +1,6 @@
# Sane TSV # Sane TSV
Sane Tab-Separated Values is a series of tabular formats as an alternative to the under-specified TSV / CSV quagmire. Sane TSV is a strict format for tabular data.
# Simple TSV
Simple TSV is a strict format for tabular data.
'\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line. '\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line.
@ -16,7 +12,7 @@ Empty fields (i.e. two subsequent '\t' characters) are allowed.
The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)). The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)).
All lines in the file must have the same number of fields as are in the header. All lines in the file must have the same number of fields.
The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error. The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error.
@ -24,7 +20,7 @@ Implementations of the format do not need to handle file reading and writing dir
# Typed TSV # Typed TSV
Typed TSV builds on Simple TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types: Typed TSV builds on Sane TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
- 'string' - 'string'
- 'boolean' - 'boolean'
@ -38,7 +34,7 @@ Typed TSV builds on Simple TSV to allow for typing of columns. All column names
Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters. Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters.
All fields in the rest of the file must be of the type corresponding to their column. All fields in the rest of the file must be of the type corresponding the their column.
Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions: Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
@ -53,17 +49,13 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
- 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*` - 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
- 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed) - 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)
Binary fields are left as-is (after unescaping is performed).
Typed TSV files should have the .ytsv extension (.ttsv is already used). Typed TSV files should have the .ytsv extension (.ttsv is already used).
# Commented TSV # Commented TSV
Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. The formats are kept distinct so that some applications can take advantage of the extra flexibility comments allow, while others can stick with the more restricted Typed TSV format. Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. They are kept distinct so that some applications of it can take advantage of the extra flexibility, while others can stick with the more restricted Typed TSV format.
Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Note that the '#' character is excluded from the comment data. Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Any unescaped '#' after the start of a line are errors.
Multiple consecutive comment lines are considered a single comment, with each line separated by a '\n'.
Comments must be UTF-8 encoded text. Comments must be UTF-8 encoded text.
@ -86,7 +78,7 @@ Note that extended formats must remain parseable by baseline parsers, hence we m
Extending formats may also have restrictions. For example, they could disallow record comments and only allow the file comment above the header. Extending formats may also have restrictions. For example, they could disallow record comments and only allow the file comment above the header.
Extended formats may still use the .ctsv extension, though they could use a dedicated one instead. Extended formats may still use the .ctsv extension, though they could use a dedicated one as well.
## Ideas for Extension ## Ideas for Extension

View File

@ -1,16 +0,0 @@
# Sane TSV
## Roadmap
- Improve error reporting by including line/column information in exceptions
- Come up with a static-typing interface
Something that doesn't require an array of objects
- Check numeric formatting matches spec
- Do parallel parsing / serializing implementation
- Next task: Refactor parsing so that it can start and end at arbitrary indices and return an array of SaneTsvRecords. If the start index falls in the middle of a record, that partial record should be skipped (unless the start index is the beginning of the buffer), and parsing should continue past the end index until the record containing it is complete.
- More optimization and making parsing modular:
- Have callbacks for header parsing and field parsing
- That way other formats (like ExtraTSV) don't have to iterate through the entire set of data again.
- Finish ExtraTSV implementation
- Do zig implementation
- Make a c interface from that