Compare commits
10 Commits
cc8a122b57
...
0b302734e9
Author | SHA1 | Date | |
---|---|---|---|
|
0b302734e9 | ||
|
7bc553905d | ||
|
932fbd553a | ||
|
93f2e2ea5b | ||
|
99766f99a6 | ||
|
a5eedef36b | ||
|
ee46c93ce1 | ||
|
83602391ab | ||
|
725a5b2034 | ||
|
38d324738e |
125
SaneTsv/ExtraTsv/ExtraTsv.cs
Normal file
125
SaneTsv/ExtraTsv/ExtraTsv.cs
Normal file
@ -0,0 +1,125 @@
|
||||
|
||||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace NathanMcRae;
|
||||
|
||||
/// <summary>
/// Parser for ExtraTSV: a Commented TSV file whose file comment starts with
/// ' ExtraTSV Vx.y.z' and whose column names may carry an extra type suffix
/// (currently ':iso8601' on string columns and ':&lt;unit&gt;:units' on float columns).
/// </summary>
public class ExtraTsv : SaneTsv
{
    /// <summary>Column type marker for string columns declared with the ':iso8601' suffix.</summary>
    public class Iso8601Type : ColumnType { }

    /// <summary>Column type marker for float columns that carry a physical unit.</summary>
    public class PhysicalUnitsType : ColumnType
    {
        /// <summary>The unit symbol this column type was created with.</summary>
        public string Units { get; }

        /// <summary>Creates a units column type for the given unit symbol.</summary>
        /// <param name="Units">Unit symbol, e.g. one of <see cref="ValidUnits"/>.</param>
        public PhysicalUnitsType(string Units)
        {
            // The parameter shadows the property, so the assignment must be qualified.
            this.Units = Units;
        }
    }

    /// <summary>Unit symbols accepted in a ':&lt;unit&gt;:units' column suffix (SI base and derived units).</summary>
    public static readonly string[] ValidUnits =
    {
        "m",
        "s",
        "A",
        "K",
        "cd",
        "mol",
        "kg",
        "Hz",
        "rad",
        "sr",
        "N",
        "Pa",
        "J",
        "W",
        "C",
        "V",
        "F",
        "Ω",
        "S",
        "Wb",
        "T",
        "H",
        "°C",
        "lm",
        "lx",
        "Bq",
        "Gy",
        "Sv",
        "kat"
    };

    // Version of the ExtraTSV format this parser reports in its error messages.
    public static readonly int MajorVersion = 0;
    public static readonly int MinorVersion = 0;
    public static readonly int PatchVersion = 1;

    /// <summary>Matches the version marker expected at the start of the file comment.</summary>
    public static Regex VersionRegex = new Regex(@"^ ExtraTSV V(\d+)\.(\d+)\.(\d+)");

    /// <summary>
    /// Parses <paramref name="inputBuffer"/> as Commented TSV and then applies the ExtraTSV rules:
    /// the file comment must carry a matching major version, records must not have comments,
    /// and fields of ':iso8601' columns are converted to <see cref="DateTime"/>.
    /// </summary>
    /// <param name="inputBuffer">Raw bytes of the ExtraTSV document.</param>
    /// <returns>A new <see cref="ExtraTsv"/> holding the parsed columns and records.</returns>
    /// <exception cref="Exception">
    /// The version marker is missing or has a different major version, a units suffix names an
    /// unknown unit, a record has a comment, or a timestamp field does not match the expected format.
    /// </exception>
    public static ExtraTsv ParseExtraTsv(byte[] inputBuffer)
    {
        SaneTsv tsv = ParseCommentedTsv(inputBuffer);

        if (tsv.FileComment == null)
        {
            throw new Exception($"ExtraTSV expects the file to start with '# ExtraTSV Vx.y.z' where x.y.z is a version compatible with {MajorVersion}.{MinorVersion}.{PatchVersion}");
        }

        Match match = VersionRegex.Match(tsv.FileComment);
        if (!match.Success)
        {
            throw new Exception($"ExtraTSV expects the file to start with '# ExtraTSV Vx.y.z' where x.y.z is a version compatible with {MajorVersion}.{MinorVersion}.{PatchVersion}");
        }

        int fileMajorVersion = int.Parse(match.Groups[1].Value);

        if (fileMajorVersion != MajorVersion)
        {
            throw new Exception($"File has major version ({fileMajorVersion}) which is newer than this parser's version {MajorVersion}");
        }

        for (int i = 0; i < tsv.ColumnNames.Length; i++)
        {
            string[] typeParts = tsv.ColumnNames[i].Split(":");

            // Without a ':' there is no type suffix: a column that is merely *named* "iso8601"
            // or "units" is an ordinary column. Skipping it also keeps the Substring call and
            // the [Length - 2] lookup below in range.
            if (typeParts.Length < 2)
            {
                continue;
            }

            string extraType = typeParts[typeParts.Length - 1];

            if (extraType == "iso8601" && tsv.ColumnTypes[i] == typeof(StringType))
            {
                string columnName = tsv.ColumnNames[i].Substring(0, tsv.ColumnNames[i].Length - ":iso8601".Length);
                tsv.ColumnNames[i] = columnName;
                tsv.ColumnTypes[i] = typeof(Iso8601Type);
            }
            // TODO: ISO8601 time spans
            // TODO: ISO8601 time durations
            else if (extraType == "units" && (tsv.ColumnTypes[i] == typeof(Float64Type) || tsv.ColumnTypes[i] == typeof(Float32Type)))
            {
                string units = typeParts[typeParts.Length - 2];
                if (!ValidUnits.Contains(units))
                {
                    throw new Exception($"Invalid units type '{units}' for column {i}");
                }

                // TODO: How to store type information since the ColumnTypes is of type Type?
            }
        }

        CultureInfo provider = CultureInfo.InvariantCulture;

        for (int i = 0; i < tsv.Records.Count; i++)
        {
            if (tsv.Records[i].Comment != null)
            {
                throw new Exception($"Line {tsv.Records[i].Line} has comment above it which is not allowed");
            }

            for (int j = 0; j < tsv.ColumnNames.Length; j++)
            {
                if (tsv.ColumnTypes[j] != typeof(Iso8601Type))
                {
                    continue;
                }

                if (!DateTime.TryParseExact((string)tsv.Records[i][j], "yyyy-MM-ddTHH:mm:ss.ffff", provider, DateTimeStyles.None, out DateTime parsed))
                {
                    throw new Exception($"ISO 8601 timestamp format error on line {tsv.Records[i].Line}, field {j}");
                }

                tsv.Records[i].Fields[j] = parsed;
            }
        }

        // 'tsv' was not created as an ExtraTsv, so a downcast would throw InvalidCastException.
        // Build a real ExtraTsv and hand the parsed state over to it instead.
        var result = new ExtraTsv();
        result.ColumnNames = tsv.ColumnNames;
        result.ColumnTypes = tsv.ColumnTypes;
        result.FileComment = tsv.FileComment;
        // Re-create the records so that their Parent is the object we return.
        result.Records = tsv.Records
            .Select(record => new SaneTsvRecord(result, record.Fields, record.Comment, record.Line))
            .ToList();

        return result;
    }
}
|
14
SaneTsv/ExtraTsv/ExtraTsv.csproj
Normal file
14
SaneTsv/ExtraTsv/ExtraTsv.csproj
Normal file
@ -0,0 +1,14 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<RootNamespace>NathanMcRae</RootNamespace>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\SaneTsv.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
44
SaneTsv/ExtraTsv/readme.md
Normal file
44
SaneTsv/ExtraTsv/readme.md
Normal file
@ -0,0 +1,44 @@
|
||||
Extra TSV adds many convenience types to Sane TSV:
|
||||
|
||||
- Timestamps
|
||||
Just this format for now: yyyy-MM-ddTHH:mm:ss.ffff
|
||||
- Timespans
|
||||
- Time durations
|
||||
- Multiformats
|
||||
- Multihashes
|
||||
- Multiprotocols
|
||||
- ...
|
||||
- Physical units
|
||||
To start with, just use SI base and derived units
|
||||
- Base units
|
||||
- m
|
||||
- s
|
||||
- A
|
||||
- K
|
||||
- cd
|
||||
- mol
|
||||
- kg
|
||||
- Derived units
|
||||
- Hz
|
||||
- rad
|
||||
- sr
|
||||
- N
|
||||
- Pa
|
||||
- J
|
||||
- W
|
||||
- C
|
||||
- V
|
||||
- F
|
||||
- Ω
|
||||
- S
|
||||
- Wb
|
||||
- T
|
||||
- H
|
||||
- °C
|
||||
- lm
|
||||
- lx
|
||||
- Bq
|
||||
- Gy
|
||||
- Sv
|
||||
- kat
|
||||
How to handle derived units?
|
14
SaneTsv/ExtraTsvTest/ExtraTsvTest.csproj
Normal file
14
SaneTsv/ExtraTsvTest/ExtraTsvTest.csproj
Normal file
@ -0,0 +1,14 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\ExtraTsv\ExtraTsv.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
20
SaneTsv/ExtraTsvTest/Program.cs
Normal file
20
SaneTsv/ExtraTsvTest/Program.cs
Normal file
@ -0,0 +1,20 @@
|
||||
using NathanMcRae;
|
||||
using System.Text;
|
||||
|
||||
{
    // Parses a small three-column ExtraTSV document and checks that the first
    // record's boolean column ("column1:ty#pe" after unescaping) reads back as true.
    string testName = "Bool test";
    string testString1 = "# ExtraTSV V0.0.1\n" +
        "column1:ty\\#pe:boolean\tcolumn2:binary\tcolumnthree\\nyep:iso8601:string" +
        "\nTRUE\tvalue\\\\t\0woo\t2024-02-15T18:03:30.0000" +
        "\nFALSE\tnother\t2024-02-15T18:03:39.0001";

    byte[] inputBytes = Encoding.UTF8.GetBytes(testString1);
    ExtraTsv parsed = ExtraTsv.ParseExtraTsv(inputBytes);

    // Passes only when the field is a boxed bool holding true.
    bool passed = parsed.Records[0]["column1:ty#pe"] is true;

    Console.WriteLine(passed ? $"Passed {testName}" : $"Failed {testName}");
}
|
@ -7,36 +7,34 @@ namespace NathanMcRae;
|
||||
/// </summary>
|
||||
public class SaneTsv
|
||||
{
|
||||
public enum ColumnType
|
||||
{
|
||||
STRING,
|
||||
BOOLEAN,
|
||||
FLOAT32,
|
||||
FLOAT64,
|
||||
UINT32,
|
||||
UINT64,
|
||||
INT32,
|
||||
INT64,
|
||||
BINARY,
|
||||
}
|
||||
// Like an enum, but more extensible
|
||||
public class ColumnType { }
|
||||
public class StringType : ColumnType { }
|
||||
public class BooleanType : ColumnType { }
|
||||
public class Float32Type : ColumnType { }
|
||||
public class Float64Type : ColumnType { }
|
||||
public class UInt32Type : ColumnType { }
|
||||
public class UInt64Type : ColumnType { }
|
||||
public class Int32Type : ColumnType { }
|
||||
public class Int64Type : ColumnType { }
|
||||
public class BinaryType : ColumnType { }
|
||||
|
||||
protected enum FormatType
|
||||
{
|
||||
SANE_TSV = 0,
|
||||
SIMPLE_TSV = 0,
|
||||
TYPED_TSV = 1,
|
||||
COMMENTED_TSV = 2,
|
||||
}
|
||||
|
||||
// TODO: We need to be able to update all these in tandem somehow
|
||||
public string[] ColumnNames { get; protected set; }
|
||||
public ColumnType[] ColumnTypes { get; protected set; }
|
||||
public Dictionary<string, List<object>> Columns { get; protected set; }
|
||||
public Type[] ColumnTypes { get; protected set; }
|
||||
public List<SaneTsvRecord> Records { get; protected set; }
|
||||
public string FileComment { get; protected set; } = null;
|
||||
|
||||
public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
|
||||
public static SaneTsv ParseSimpleTsv(byte[] inputBuffer)
|
||||
{
|
||||
return Parse(inputBuffer, FormatType.SANE_TSV);
|
||||
return Parse(inputBuffer, FormatType.SIMPLE_TSV);
|
||||
}
|
||||
|
||||
public static SaneTsv ParseTypedTsv(byte[] inputBuffer)
|
||||
@ -53,9 +51,8 @@ public class SaneTsv
|
||||
protected static SaneTsv Parse(byte[] inputBuffer, FormatType format)
|
||||
{
|
||||
var parsed = new SaneTsv();
|
||||
parsed.Columns = new Dictionary<string, List<object>>();
|
||||
parsed.ColumnNames = new string[] { };
|
||||
parsed.ColumnTypes = new ColumnType[] { };
|
||||
parsed.ColumnTypes = new Type[] { };
|
||||
parsed.Records = new List<SaneTsvRecord>();
|
||||
|
||||
var fieldBytes = new List<byte>();
|
||||
@ -95,7 +92,7 @@ public class SaneTsv
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}");
|
||||
throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
|
||||
}
|
||||
}
|
||||
else if (inputBuffer[i] == '\t')
|
||||
@ -116,7 +113,7 @@ public class SaneTsv
|
||||
numFields = fields.Count;
|
||||
|
||||
parsed.ColumnNames = new string[numFields];
|
||||
parsed.ColumnTypes = new ColumnType[numFields];
|
||||
parsed.ColumnTypes = new Type[numFields];
|
||||
|
||||
int numTypesBlank = 0;
|
||||
|
||||
@ -135,7 +132,7 @@ public class SaneTsv
|
||||
string columnTypeString;
|
||||
string columnName;
|
||||
if (columnString.Contains(':')) {
|
||||
if (format == FormatType.SANE_TSV)
|
||||
if (format == FormatType.SIMPLE_TSV)
|
||||
{
|
||||
throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names");
|
||||
}
|
||||
@ -144,7 +141,7 @@ public class SaneTsv
|
||||
}
|
||||
else
|
||||
{
|
||||
if (format > FormatType.SANE_TSV)
|
||||
if (format > FormatType.SIMPLE_TSV)
|
||||
{
|
||||
throw new Exception($"Header {fields.Count} has no type");
|
||||
}
|
||||
@ -152,53 +149,46 @@ public class SaneTsv
|
||||
columnName = columnString;
|
||||
}
|
||||
|
||||
ColumnType type;
|
||||
Type type;
|
||||
|
||||
switch (columnTypeString)
|
||||
{
|
||||
case "":
|
||||
numTypesBlank++;
|
||||
type = ColumnType.STRING;
|
||||
type = typeof(StringType);
|
||||
break;
|
||||
case "string":
|
||||
type = ColumnType.STRING;
|
||||
type = typeof(StringType);
|
||||
break;
|
||||
case "boolean":
|
||||
type = ColumnType.BOOLEAN;
|
||||
type = typeof(BooleanType);
|
||||
break;
|
||||
case "float32":
|
||||
type = ColumnType.FLOAT32;
|
||||
type = typeof(Float32Type);
|
||||
break;
|
||||
case "float64":
|
||||
type = ColumnType.FLOAT64;
|
||||
type = typeof(Float64Type);
|
||||
break;
|
||||
case "uint32":
|
||||
type = ColumnType.UINT32;
|
||||
type = typeof(UInt32Type);
|
||||
break;
|
||||
case "uint64":
|
||||
type = ColumnType.UINT64;
|
||||
type = typeof(UInt64Type);
|
||||
break;
|
||||
case "int32":
|
||||
type = ColumnType.INT32;
|
||||
type = typeof(Int32Type);
|
||||
break;
|
||||
case "int64":
|
||||
type = ColumnType.INT64;
|
||||
type = typeof(Int64Type);
|
||||
break;
|
||||
case "binary":
|
||||
type = ColumnType.BINARY;
|
||||
type = typeof(BinaryType);
|
||||
break;
|
||||
default:
|
||||
throw new Exception($"Invalid type '{columnTypeString}' for column {j}");
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
parsed.Columns.Add(columnName, new List<object>());
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new Exception($"Column name {columnName} is not unique", e);
|
||||
}
|
||||
// TODO: Check column name uniqueness
|
||||
|
||||
parsed.ColumnNames[j] = columnName;
|
||||
parsed.ColumnTypes[j] = type;
|
||||
@ -224,7 +214,7 @@ public class SaneTsv
|
||||
comment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line));
|
||||
fields.Clear();
|
||||
}
|
||||
|
||||
@ -241,8 +231,11 @@ public class SaneTsv
|
||||
{
|
||||
var commentBytes = new byte[j - i - 1];
|
||||
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
|
||||
if (currentComment.Length > 0)
|
||||
{
|
||||
currentComment.Append('\n');
|
||||
}
|
||||
currentComment.Append(Encoding.UTF8.GetString(commentBytes));
|
||||
currentComment.Append("\n");
|
||||
i = j;
|
||||
currentLineStart = i + 1;
|
||||
line++;
|
||||
@ -281,23 +274,25 @@ public class SaneTsv
|
||||
comment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment, line));
|
||||
fields.Clear();
|
||||
}
|
||||
|
||||
return parsed;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Note: this modifies 'parsed'
|
||||
/// </summary>
|
||||
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
|
||||
{
|
||||
var parsedFields = new object[fields.Count];
|
||||
for (int j = 0; j < fields.Count; j++)
|
||||
{
|
||||
// All other types require the content to be UTF-8. Binary fields can ignore that.
|
||||
if (parsed.ColumnTypes[j] == ColumnType.BINARY)
|
||||
if (parsed.ColumnTypes[j] == typeof(BinaryType))
|
||||
{
|
||||
parsedFields[j] = fields[j];
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -311,13 +306,14 @@ public class SaneTsv
|
||||
throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
|
||||
}
|
||||
|
||||
switch (parsed.ColumnTypes[j])
|
||||
// TODO: Add checking for numeric types format
|
||||
|
||||
if (parsed.ColumnTypes[j] == typeof(StringType))
|
||||
{
|
||||
case ColumnType.STRING:
|
||||
parsedFields[j] = fieldString;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(fieldString);
|
||||
break;
|
||||
case ColumnType.BOOLEAN:
|
||||
}
|
||||
else if (parsed.ColumnTypes[j] == typeof(BooleanType))
|
||||
{
|
||||
bool parsedBool;
|
||||
if (fieldString == "TRUE")
|
||||
{
|
||||
@ -333,65 +329,63 @@ public class SaneTsv
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedBool;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(parsedBool);
|
||||
break;
|
||||
case ColumnType.FLOAT32:
|
||||
}
|
||||
else if (parsed.ColumnTypes[j] == typeof(Float32Type))
|
||||
{
|
||||
if (!float.TryParse(fieldString, out float parsedFloat))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {line} is not valid single-precision float");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedFloat;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(parsedFloat);
|
||||
break;
|
||||
case ColumnType.FLOAT64:
|
||||
}
|
||||
else if (parsed.ColumnTypes[j] == typeof(Float64Type))
|
||||
{
|
||||
if (!double.TryParse(fieldString, out double parsedDouble))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {line} is not valid double-precision float");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedDouble;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(parsedDouble);
|
||||
break;
|
||||
case ColumnType.UINT32:
|
||||
}
|
||||
else if (parsed.ColumnTypes[j] == typeof(UInt32Type))
|
||||
{
|
||||
if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {line} is not valid UInt32");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedUInt32;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt32);
|
||||
break;
|
||||
case ColumnType.UINT64:
|
||||
}
|
||||
else if (parsed.ColumnTypes[j] == typeof(UInt64Type))
|
||||
{
|
||||
if (!UInt64.TryParse(fieldString, out UInt64 parsedUInt64))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {line} is not valid UInt64");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedUInt64;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(parsedUInt64);
|
||||
break;
|
||||
case ColumnType.INT32:
|
||||
}
|
||||
else if (parsed.ColumnTypes[j] == typeof(Int32Type))
|
||||
{
|
||||
if (!Int32.TryParse(fieldString, out Int32 parsedInt32))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {line} is not valid Int32");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedInt32;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt32);
|
||||
break;
|
||||
case ColumnType.INT64:
|
||||
}
|
||||
else if (parsed.ColumnTypes[j] == typeof(Int64Type))
|
||||
{
|
||||
if (!Int64.TryParse(fieldString, out Int64 parsedInt64))
|
||||
{
|
||||
throw new Exception($"Field {j} on line {line} is not valid Int64");
|
||||
}
|
||||
|
||||
parsedFields[j] = parsedInt64;
|
||||
parsed.Columns[parsed.ColumnNames[j]].Add(parsedInt64);
|
||||
break;
|
||||
case ColumnType.BINARY:
|
||||
throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
|
||||
default:
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception($"Unexpected type {parsed.ColumnTypes[j]}");
|
||||
}
|
||||
}
|
||||
@ -399,6 +393,103 @@ public class SaneTsv
|
||||
return parsedFields;
|
||||
}
|
||||
|
||||
/// <summary>
/// Serializes a header and rows of string fields into Simple TSV, encoded as UTF-8.
/// Fields are separated by '\t' and lines by '\n'; the output never ends with '\n'.
/// </summary>
/// <param name="header">Column names; they must be unique and must not contain ':'.</param>
/// <param name="data">Rows of fields; every row must have as many fields as the header.</param>
/// <returns>The UTF-8 bytes of the serialized document.</returns>
/// <exception cref="Exception">
/// A column name contains ':', column names are not unique, or a row's field count
/// differs from the header's.
/// </exception>
public static byte[] SerializeSimpleTsv(IList<string> header, IList<IList<string>> data)
{
    var escapedString = new StringBuilder();

    // Serialize header
    for (int i = 0; i < header.Count; i++)
    {
        if (header[i].Contains(':'))
        {
            throw new Exception($"Column {i} contains the character ':'");
        }

        for (int j = i + 1; j < header.Count; j++)
        {
            if (header[i] == header[j])
            {
                throw new Exception("Column names in header must be unique");
            }
        }

        if (i > 0)
        {
            escapedString.Append('\t');
        }

        AppendEscapedTsvField(escapedString, header[i]);
    }

    // Serialize data
    for (int i = 0; i < data.Count; i++)
    {
        if (data[i].Count != header.Count)
        {
            throw new Exception($"Record {i} has {data[i].Count} fields but the header has {header.Count}");
        }

        // The line break is written *before* each record rather than after the header,
        // so a header-only document does not end with '\n'.
        escapedString.Append('\n');

        for (int j = 0; j < data[i].Count; j++)
        {
            if (j > 0)
            {
                escapedString.Append('\t');
            }

            AppendEscapedTsvField(escapedString, data[i][j]);
        }
    }

    return Encoding.UTF8.GetBytes(escapedString.ToString());
}

// Appends 'field' to 'output', escaping '\n', '\t', '\\' and '#' with a leading backslash.
private static void AppendEscapedTsvField(StringBuilder output, string field)
{
    foreach (char c in field)
    {
        switch (c)
        {
            case '\n':
                output.Append("\\n");
                break;
            case '\t':
                output.Append("\\t");
                break;
            case '\\':
                output.Append("\\\\");
                break;
            case '#':
                output.Append("\\#");
                break;
            default:
                output.Append(c);
                break;
        }
    }
}
|
||||
|
||||
public SaneTsvRecord this[int i] => Records[i];
|
||||
|
||||
public class SaneTsvRecord
|
||||
@ -406,14 +497,18 @@ public class SaneTsv
|
||||
public SaneTsv Parent { get; }
|
||||
public string Comment { get; }
|
||||
public object[] Fields { get; }
|
||||
public int Line { get; }
|
||||
|
||||
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
|
||||
|
||||
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
|
||||
public object this[int columnIndex] => Fields[columnIndex];
|
||||
|
||||
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment, int line)
|
||||
{
|
||||
Parent = parent;
|
||||
Fields = fields;
|
||||
Comment = comment;
|
||||
Line = line;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -10,8 +10,14 @@
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Compile Remove="ExtraTsvTest\**" />
|
||||
<Compile Remove="ExtraTsv\**" />
|
||||
<Compile Remove="SaneTsvTest\**" />
|
||||
<EmbeddedResource Remove="ExtraTsvTest\**" />
|
||||
<EmbeddedResource Remove="ExtraTsv\**" />
|
||||
<EmbeddedResource Remove="SaneTsvTest\**" />
|
||||
<None Remove="ExtraTsvTest\**" />
|
||||
<None Remove="ExtraTsv\**" />
|
||||
<None Remove="SaneTsvTest\**" />
|
||||
</ItemGroup>
|
||||
|
||||
|
@ -5,7 +5,11 @@ VisualStudioVersion = 17.7.34024.191
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsv", "SaneTsv.csproj", "{DBC5CE44-361C-4387-B1E2-409C1CAE2B4C}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}"
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SaneTsvTest", "SaneTsvTest\SaneTsvTest.csproj", "{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}"
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ExtraTsv", "ExtraTsv\ExtraTsv.csproj", "{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ExtraTsvTest", "ExtraTsvTest\ExtraTsvTest.csproj", "{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
@ -21,6 +25,14 @@ Global
|
||||
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{43B1B09C-19BD-4B45-B41B-7C00DB3F7E9C}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{D9F2E9C8-4F52-4BB7-9BBD-AE9A0C6168E7}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{A545B0DB-F799-43E2-9DFA-C18BDF3535F1}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
@ -49,4 +49,26 @@ using System.Text;
|
||||
SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1));
|
||||
}
|
||||
|
||||
{
    // Round-trip check: parse a Simple TSV document, serialize the parsed header
    // and records again, and compare the result with the original text.
    string testName = "Serde test";
    string testString1 = "column1\tcolumn2\tcolumnthree\\nyep" +
        "\nTRUE\tvalue\\\\twoo\tvaluetrhee" +
        "\nFALSE\tnother\tno\\ther";

    byte[] inputBytes = Encoding.UTF8.GetBytes(testString1);
    SaneTsv parsed = SaneTsv.ParseSimpleTsv(inputBytes);

    // Every parsed field is turned back into text via ToString().
    var rows = parsed.Records
        .Select(r => r.Fields.Select(f => f.ToString()).ToArray())
        .ToArray();
    byte[] outputBytes = SaneTsv.SerializeSimpleTsv(parsed.ColumnNames, rows);
    string serialized = Encoding.UTF8.GetString(outputBytes);

    bool roundTripped = testString1 == serialized;
    Console.WriteLine(roundTripped ? $"Passed {testName}" : $"Failed {testName}");
}
|
||||
|
||||
Console.WriteLine("Done with tests");
|
||||
|
||||
|
||||
// TODO: Check qNaN, sNaN, +inf, -inf values for float types
|
||||
|
@ -1,6 +1,10 @@
|
||||
# Sane TSV
|
||||
|
||||
Sane TSV is a strict format for tabular data.
|
||||
Sane Tab-Separated Values is a series of tabular formats as an alternative to the under-specified TSV / CSV quagmire.
|
||||
|
||||
# Simple TSV
|
||||
|
||||
Simple TSV is a strict format for tabular data.
|
||||
|
||||
'\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line.
|
||||
|
||||
@ -12,7 +16,7 @@ Empty fields (i.e. two subsequent '\t' characters) are allowed.
|
||||
|
||||
The first line is always the header and the fields of the header are the column names for the file. Column names must be unique within the file and must not contain ':' characters (for compatibility with [Typed TSVs](#typed-tsv)).
|
||||
|
||||
All lines in the file must have the same number of fields.
|
||||
All lines in the file must have the same number of fields as are in the header.
|
||||
|
||||
The file must not end with '\n'. That will be treated as if there is an empty row at the end of a file and cause an error.
|
||||
|
||||
@ -20,7 +24,7 @@ Implementations of the format do not need to handle file reading and writing dir
|
||||
|
||||
# Typed TSV
|
||||
|
||||
Typed TSV builds on Sane TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
|
||||
Typed TSV builds on Simple TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
|
||||
|
||||
- 'string'
|
||||
- 'boolean'
|
||||
@ -34,7 +38,7 @@ Typed TSV builds on Sane TSV to allow for typing of columns. All column names in
|
||||
|
||||
Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters.
|
||||
|
||||
All fields in the rest of the file must be of the type corresponding the their column.
|
||||
All fields in the rest of the file must be of the type corresponding to their column.
|
||||
|
||||
Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
|
||||
|
||||
@ -49,13 +53,17 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
|
||||
- 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
|
||||
- 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)
|
||||
|
||||
Binary fields are left as-is (after unescaping is performed).
|
||||
|
||||
Typed TSV files should have the .ytsv extension (.ttsv is already used).
|
||||
|
||||
# Commented TSV
|
||||
|
||||
Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. They are kept distinct so that some applications of it can take advantage of the extra flexibility, while others can stick with the more restricted Typed TSV format.
|
||||
Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. The formats are kept distinct so that some applications can take advantage of the extra flexibility comments allow, while others can stick with the more restricted Typed TSV format.
|
||||
|
||||
Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Any unescaped '#' after the start of a line are errors.
|
||||
Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Note that the '#' character is excluded from the comment data.
|
||||
|
||||
Multiple consecutive comment lines are considered a single comment, with each line separated by a '\n'.
|
||||
|
||||
Comments must be UTF-8 encoded text.
|
||||
|
||||
@ -78,7 +86,7 @@ Note that extended formats must remain parseable by baseline parsers, hence we m
|
||||
|
||||
Extending formats may also have restrictions. For example, they could disallow record comments and only allow the file comment above the header.
|
||||
|
||||
Extended formats may still use the .ctsv extension, though they could use a dedicated one as well.
|
||||
Extended formats may still use the .ctsv extension, though they could use a dedicated one instead.
|
||||
|
||||
## Ideas for Extension
|
||||
|
||||
|
16
readme.md
Normal file
16
readme.md
Normal file
@ -0,0 +1,16 @@
|
||||
# Sane TSV
|
||||
|
||||
## Roadmap
|
||||
|
||||
- Improve error reporting by including line/column information in exceptions
|
||||
- Come up with a static-typing interface
|
||||
Something that doesn't require an array of objects
|
||||
- Check numeric formatting matches spec
|
||||
- Do parallel parsing / serializing implementation
|
||||
- Next task: Refactor parsing so that it will start and end at arbitrary indices and return an array of SaneTsvRecords. The refactor should ignore the current record (unless at the start of the buffer) and continue parsing the record the end index is in.
|
||||
- More optimization and making parsing modular:
|
||||
- Have callbacks for header parsing and field parsing
|
||||
- That way other formats (like ExtraTSV) don't have to iterate through the entire set of data again.
|
||||
- Finish ExtraTSV implementation
|
||||
- Do Zig implementation
|
||||
- Make a C interface from that
|
Loading…
Reference in New Issue
Block a user