Add freeform ParseSimpleTsv

No column spec, just parse whatever you get
This commit is contained in:
Nathan McRae 2024-02-22 23:16:35 -08:00
parent 7368ac816b
commit 3727f8051b

View File

@ -8,7 +8,7 @@ public class Tsv<T> where T : SaneTsv.TsvRecord
public virtual List<T> Records { get; set; } public virtual List<T> Records { get; set; }
} }
public class CommentedTsv<T>: Tsv<T> where T : SaneTsv.TsvRecord public class CommentedTsv<T> : Tsv<T> where T : SaneTsv.TsvRecord
{ {
public override List<T> Records { get; set; } public override List<T> Records { get; set; }
public string FileComment { get; set; } = null; public string FileComment { get; set; } = null;
@ -72,7 +72,7 @@ public class SaneTsv
public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new() public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new()
{ {
// TODO: add the file comment? // TODO: add the file comment?
return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV); return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);
} }
// TODO: Have parsing errors include line / column # // TODO: Have parsing errors include line / column #
@ -272,7 +272,7 @@ public class SaneTsv
throw new Exception("Found a file comment, but parser wasn't expecting a comment"); throw new Exception("Found a file comment, but parser wasn't expecting a comment");
} }
} }
fields.Clear(); fields.Clear();
} }
@ -355,7 +355,7 @@ public class SaneTsv
return parsed; return parsed;
} }
protected static T ParseCurrentCommentedRecord<T> (Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new() protected static T ParseCurrentCommentedRecord<T>(Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
{ {
return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line); return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line);
} }
@ -372,7 +372,7 @@ public class SaneTsv
{ {
throw new Exception($"Found comment for line {line}, but format does not support comments"); throw new Exception($"Found comment for line {line}, but format does not support comments");
} }
record.Line = line; record.Line = line;
for (int j = 0; j < fields.Count; j++) for (int j = 0; j < fields.Count; j++)
@ -644,6 +644,157 @@ public class SaneTsv
return Encoding.UTF8.GetBytes(escapedString.ToString()); return Encoding.UTF8.GetBytes(escapedString.ToString());
} }
/// <summary>
/// Parses a TSV buffer without any column type specification: the first line is
/// taken as the column names and every following line as a record of strings.
/// </summary>
/// <remarks>
/// Recognised escape sequences inside a field are <c>\n</c>, <c>\t</c>, <c>\\</c>
/// and <c>\#</c>. A raw tab ends a field, a raw line feed ends a line, and an
/// unescaped <c>#</c> is rejected. The text after the final line feed is always
/// parsed as one more record, so the input must not end with a line feed.
/// </remarks>
/// <param name="inputBuffer">Raw bytes of the TSV document (expected to be UTF-8).</param>
/// <returns>
/// The column names from the header line, and one string array per record.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
/// <exception cref="Exception">
/// The input has an invalid escape sequence, an unescaped '#', a column name
/// containing ':', a line whose field count differs from the header, bytes that
/// are not valid UTF-8, or no line-feed-terminated header line.
/// </exception>
public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
{
    if (inputBuffer == null)
    {
        throw new ArgumentNullException(nameof(inputBuffer));
    }

    // Encoding.UTF8 silently replaces malformed bytes with U+FFFD, so it can never
    // report invalid input. A throwing encoding is needed for the checks below.
    var strictUtf8 = new UTF8Encoding(false, true);

    string[] columnNames = null;
    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var records = new List<string[]>();
    int numFields = -1; // -1 until the header line has been read
    int line = 1;
    int currentLineStart = 0;

    for (int i = 0; i < inputBuffer.Length; i++)
    {
        byte current = inputBuffer[i];

        if (current == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            switch (inputBuffer[i + 1])
            {
                case (byte)'n':
                    fieldBytes.Add((byte)'\n');
                    break;
                case (byte)'\\':
                    fieldBytes.Add((byte)'\\');
                    break;
                case (byte)'t':
                    fieldBytes.Add((byte)'\t');
                    break;
                case (byte)'#':
                    fieldBytes.Add((byte)'#');
                    break;
                default:
                    throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
            }

            // Skip the escaped character that was just consumed.
            i++;
        }
        else if (current == '\t')
        {
            // end of field
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (current == '\n')
        {
            // end of line: the pending bytes are the last field of this line
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();

            if (numFields < 0)
            {
                // This is the header
                numFields = fields.Count;
                columnNames = new string[numFields];

                for (int j = 0; j < fields.Count; j++)
                {
                    string columnString;
                    try
                    {
                        columnString = strictUtf8.GetString(fields[j]);
                    }
                    catch (DecoderFallbackException e)
                    {
                        throw new Exception($"Column name {j} is not valid UTF-8", e);
                    }

                    if (columnString.Contains(':'))
                    {
                        throw new Exception($"Header {j} contains ':', which is not allowed for column names");
                    }

                    columnNames[j] = columnString;
                }
            }
            else if (numFields != fields.Count)
            {
                throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
            }
            else
            {
                records.Add(DecodeSimpleTsvRecord(fields, line, strictUtf8));
            }

            fields.Clear();
            line++;
            currentLineStart = i + 1;
        }
        else if (current == '#')
        {
            throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
        }
        else
        {
            fieldBytes.Add(current);
        }
    }

    // The last line has no terminating '\n', so its final field is still pending.
    fields.Add(fieldBytes.ToArray());

    if (numFields < 0)
    {
        // No '\n' was ever seen, so no header was recorded and nothing can be parsed.
        throw new Exception("Reached end of input before the header line was terminated by '\\n'");
    }

    if (numFields != fields.Count)
    {
        if (fields.Count == 1 && fields[0].Length == 0)
        {
            // An empty last line is the typical symptom of a trailing line feed.
            throw new Exception($"Expected {numFields} fields on line {line}, but found an empty line. Possibly because of extra \\n after last record");
        }

        throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
    }

    records.Add(DecodeSimpleTsvRecord(fields, line, strictUtf8));

    return (columnNames, records.ToArray());
}

// Decodes every raw field of one record into a string, reporting the line and
// (zero-based) column of the first field that is not valid under the given encoding.
private static string[] DecodeSimpleTsvRecord(List<byte[]> fields, int line, Encoding strictEncoding)
{
    var fieldStrings = new string[fields.Count];
    for (int j = 0; j < fields.Count; j++)
    {
        try
        {
            fieldStrings[j] = strictEncoding.GetString(fields[j]);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Line {line}, column {j} is not valid UTF-8", e);
        }
    }

    return fieldStrings;
}
public static Type GetColumnFromType(Type type) public static Type GetColumnFromType(Type type)
{ {
if (type == typeof(string)) if (type == typeof(string))
@ -1082,11 +1233,11 @@ public class SaneTsv
public string ColumnName { get; } public string ColumnName { get; }
public virtual Type ColumnType { get; } public virtual Type ColumnType { get; }
public TsvColumnAttribute() public TsvColumnAttribute()
{ {
ColumnType = typeof(StringType); ColumnType = typeof(StringType);
} }
public TsvColumnAttribute(string columnName) public TsvColumnAttribute(string columnName)
{ {
ColumnType = typeof(StringType); ColumnType = typeof(StringType);
ColumnName = columnName; ColumnName = columnName;