'Sane' -> 'Simple' for first format

Change the naming so the overall family of formats is Sane TSV, while the simplest
format is Simple TSV.
This commit is contained in:
Nathan McRae 2024-02-15 12:52:27 -08:00
parent 38d324738e
commit 725a5b2034
3 changed files with 34 additions and 11 deletions

View File

@ -22,7 +22,7 @@ public class SaneTsv
protected enum FormatType protected enum FormatType
{ {
SANE_TSV = 0, SIMPLE_TSV = 0,
TYPED_TSV = 1, TYPED_TSV = 1,
COMMENTED_TSV = 2, COMMENTED_TSV = 2,
} }
@ -34,9 +34,9 @@ public class SaneTsv
public List<SaneTsvRecord> Records { get; protected set; } public List<SaneTsvRecord> Records { get; protected set; }
public string FileComment { get; protected set; } = null; public string FileComment { get; protected set; } = null;
public static SaneTsv ParseSaneTsv(byte[] inputBuffer) public static SaneTsv ParseSimpleTsv(byte[] inputBuffer)
{ {
return Parse(inputBuffer, FormatType.SANE_TSV); return Parse(inputBuffer, FormatType.SIMPLE_TSV);
} }
public static SaneTsv ParseTypedTsv(byte[] inputBuffer) public static SaneTsv ParseTypedTsv(byte[] inputBuffer)
@ -135,7 +135,7 @@ public class SaneTsv
string columnTypeString; string columnTypeString;
string columnName; string columnName;
if (columnString.Contains(':')) { if (columnString.Contains(':')) {
if (format == FormatType.SANE_TSV) if (format == FormatType.SIMPLE_TSV)
{ {
throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names"); throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names");
} }
@ -144,7 +144,7 @@ public class SaneTsv
} }
else else
{ {
if (format > FormatType.SANE_TSV) if (format > FormatType.SIMPLE_TSV)
{ {
throw new Exception($"Header {fields.Count} has no type"); throw new Exception($"Header {fields.Count} has no type");
} }
@ -401,7 +401,7 @@ public class SaneTsv
return parsedFields; return parsedFields;
} }
public static byte[] SerializeSaneTsv(IList<string> header, IList<IList<string>> data) public static byte[] SerializeSimpleTsv(IList<string> header, IList<IList<string>> data)
{ {
var escapedString = new StringBuilder(); var escapedString = new StringBuilder();

View File

@ -39,7 +39,7 @@ using System.Text;
{ {
string testName = "Comment test"; string testName = "Comment test";
string testString1 = "#This is a file comment\n" + string testString1 = "#This is a file comment\n" +
" #One more file comment line\n" + "#One more file comment line\n" +
"column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" + "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
"\n#This is a comment" + "\n#This is a comment" +
"\n#Another comment line" + "\n#Another comment line" +
@ -49,4 +49,23 @@ using System.Text;
SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1)); SaneTsv parsed = SaneTsv.ParseCommentedTsv(Encoding.UTF8.GetBytes(testString1));
} }
{
string testName = "Serde test";
string testString1 = "column1\tcolumn2\tcolumnthree\\nyep" +
"\nTRUE\tvalue\\\\twoo\tvaluetrhee" +
"\nFALSE\tnother\tno\\ther";
SaneTsv parsed = SaneTsv.ParseSimpleTsv(Encoding.UTF8.GetBytes(testString1));
string serialized = Encoding.UTF8.GetString(SaneTsv.SerializeSimpleTsv(parsed.ColumnNames, parsed.Records.Select(r => r.Fields.Select(f => f.ToString()).ToArray()).ToArray()));
if (testString1 == serialized)
{
Console.WriteLine($"Passed {testName}");
}
else
{
Console.WriteLine($"Failed {testName}");
}
}
Console.WriteLine("Done with tests"); Console.WriteLine("Done with tests");

View File

@ -1,6 +1,10 @@
# Sane TSV # Sane TSV
Sane TSV is a strict format for tabular data. Sane Tab-Separated Values is a series of tabular formats as an alternative to the under-specified TSV / CSV quagmire.
# Simple TSV
Simple TSV is a strict format for tabular data.
'\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line. '\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line.
@ -20,7 +24,7 @@ Implementations of the format do not need to handle file reading and writing dir
# Typed TSV # Typed TSV
Typed TSV builds on Sane TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types: Typed TSV builds on Simple TSV to allow for typing of columns. All column names in a typed TSV must end with ':' (0x3A) and then one of the following types:
- 'string' - 'string'
- 'boolean' - 'boolean'
@ -34,7 +38,7 @@ Typed TSV builds on Sane TSV to allow for typing of columns. All column names in
Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters. Any other values are an error, however, the portion of the name prior to the last ':' may be anything and may include ':' characters.
All fields in the rest of the file must be of the type corresponding the their column. All fields in the rest of the file must be of the type corresponding to their column.
Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions: Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
@ -55,7 +59,7 @@ Typed TSV files should have the .ytsv extension (.ttsv is already used).
Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. They are kept distinct so that some applications of it can take advantage of the extra flexibility, while others can stick with the more restricted Typed TSV format. Commented TSV builds on Typed TSV and allows for more flexibility in the format by including line comments. They are kept distinct so that some applications of it can take advantage of the extra flexibility, while others can stick with the more restricted Typed TSV format.
Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. Any unescaped '#' after the start of a line are errors. Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped.
Comments must be UTF-8 encoded text. Comments must be UTF-8 encoded text.