Add freeform ParseSimpleTsv
No column spec, just parse whatever you get
This commit is contained in:
parent
7368ac816b
commit
3727f8051b
@ -8,7 +8,7 @@ public class Tsv<T> where T : SaneTsv.TsvRecord
|
|||||||
public virtual List<T> Records { get; set; }
|
public virtual List<T> Records { get; set; }
|
||||||
}
|
}
|
||||||
|
|
||||||
public class CommentedTsv<T>: Tsv<T> where T : SaneTsv.TsvRecord
|
public class CommentedTsv<T> : Tsv<T> where T : SaneTsv.TsvRecord
|
||||||
{
|
{
|
||||||
public override List<T> Records { get; set; }
|
public override List<T> Records { get; set; }
|
||||||
public string FileComment { get; set; } = null;
|
public string FileComment { get; set; } = null;
|
||||||
@ -72,7 +72,7 @@ public class SaneTsv
|
|||||||
public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new()
|
public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new()
|
||||||
{
|
{
|
||||||
// TODO: add the file comment?
|
// TODO: add the file comment?
|
||||||
return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);
|
return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Have parsing errors include line / column #
|
// TODO: Have parsing errors include line / column #
|
||||||
@ -272,7 +272,7 @@ public class SaneTsv
|
|||||||
throw new Exception("Found a file comment, but parser wasn't expecting a comment");
|
throw new Exception("Found a file comment, but parser wasn't expecting a comment");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
fields.Clear();
|
fields.Clear();
|
||||||
}
|
}
|
||||||
@ -355,7 +355,7 @@ public class SaneTsv
|
|||||||
return parsed;
|
return parsed;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static T ParseCurrentCommentedRecord<T> (Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
|
protected static T ParseCurrentCommentedRecord<T>(Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
|
||||||
{
|
{
|
||||||
return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line);
|
return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line);
|
||||||
}
|
}
|
||||||
@ -372,7 +372,7 @@ public class SaneTsv
|
|||||||
{
|
{
|
||||||
throw new Exception($"Found comment for line {line}, but format does not support comments");
|
throw new Exception($"Found comment for line {line}, but format does not support comments");
|
||||||
}
|
}
|
||||||
|
|
||||||
record.Line = line;
|
record.Line = line;
|
||||||
|
|
||||||
for (int j = 0; j < fields.Count; j++)
|
for (int j = 0; j < fields.Count; j++)
|
||||||
@ -644,6 +644,157 @@ public class SaneTsv
|
|||||||
return Encoding.UTF8.GetBytes(escapedString.ToString());
|
return Encoding.UTF8.GetBytes(escapedString.ToString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Strict decoder: unlike Encoding.UTF8 (which silently substitutes U+FFFD),
// this throws DecoderFallbackException on malformed byte sequences.
private static readonly UTF8Encoding SimpleTsvStrictUtf8 = new UTF8Encoding(false, true);

/// <summary>
/// Parses a TSV buffer without any column type specification: the first line is
/// taken as the column names and every following line as a record of strings.
/// </summary>
/// <remarks>
/// Fields are separated by '\t' and lines by '\n'. Inside a field the escapes
/// <c>\n</c>, <c>\t</c>, <c>\\</c> and <c>\#</c> are decoded; an unescaped '#'
/// is rejected. The last line is not expected to end with '\n': a trailing
/// newline is read as one more (empty) line, which is an error unless the file
/// has exactly one column, in which case it yields a record holding an empty string.
/// Input that contains only a header line (no '\n' at all) yields zero records.
/// </remarks>
/// <param name="inputBuffer">Raw bytes of the TSV document; fields must be valid UTF-8.</param>
/// <returns>The column names and the records, each record having one string per column.</returns>
/// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
/// <exception cref="Exception">
/// The input has a bad escape, an unescaped '#', a column name containing ':',
/// a line whose field count differs from the header, or invalid UTF-8.
/// </exception>
public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
{
    if (inputBuffer == null)
    {
        throw new ArgumentNullException(nameof(inputBuffer));
    }

    string[] columnNames = null;

    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var records = new List<string[]>();

    // -1 until the header line has been consumed.
    int numFields = -1;
    int line = 1;
    int currentLineStart = 0;

    for (int i = 0; i < inputBuffer.Length; i++)
    {
        byte current = inputBuffer[i];

        if (current == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            byte escaped = inputBuffer[i + 1];
            if (escaped == 'n')
            {
                fieldBytes.Add((byte)'\n');
            }
            else if (escaped == '\\')
            {
                fieldBytes.Add((byte)'\\');
            }
            else if (escaped == 't')
            {
                fieldBytes.Add((byte)'\t');
            }
            else if (escaped == '#')
            {
                fieldBytes.Add((byte)'#');
            }
            else
            {
                throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
            }

            // Skip the byte that completed the escape sequence.
            i++;
        }
        else if (current == '\t')
        {
            // end of field
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (current == '\n')
        {
            // end of line: flush the last field, then the whole line
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();

            if (numFields < 0)
            {
                // This is the header
                numFields = fields.Count;
                columnNames = DecodeSimpleTsvHeader(fields);
            }
            else if (numFields != fields.Count)
            {
                throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
            }
            else
            {
                records.Add(DecodeSimpleTsvRecord(fields, line));
            }

            fields.Clear();
            line++;
            currentLineStart = i + 1;
        }
        else if (current == '#')
        {
            throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
        }
        else
        {
            fieldBytes.Add(current);
        }
    }

    // The final line has no terminating '\n', so it is flushed here.
    fields.Add(fieldBytes.ToArray());

    if (numFields < 0)
    {
        // No '\n' was seen at all: the whole input is the header line.
        columnNames = DecodeSimpleTsvHeader(fields);
        return (columnNames, records.ToArray());
    }

    // An empty final line means the input ended with '\n'. With a single column
    // that is indistinguishable from a record holding an empty string, so it is
    // only reported when more than one field is expected.
    bool lastLineIsEmpty = currentLineStart == inputBuffer.Length;
    if (lastLineIsEmpty && numFields != 1)
    {
        throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record");
    }

    if (numFields != fields.Count)
    {
        throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
    }

    records.Add(DecodeSimpleTsvRecord(fields, line));

    return (columnNames, records.ToArray());
}

// Decodes the header fields into column names, rejecting invalid UTF-8 and names containing ':'.
private static string[] DecodeSimpleTsvHeader(List<byte[]> fields)
{
    var names = new string[fields.Count];
    for (int j = 0; j < fields.Count; j++)
    {
        string columnString;
        try
        {
            columnString = SimpleTsvStrictUtf8.GetString(fields[j]);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Column name {j} is not valid UTF-8", e);
        }

        if (columnString.IndexOf(':') >= 0)
        {
            throw new Exception($"Header {j} contains ':', which is not allowed for column names");
        }

        names[j] = columnString;
    }

    return names;
}

// Decodes one record's fields into strings, rejecting invalid UTF-8.
private static string[] DecodeSimpleTsvRecord(List<byte[]> fields, int line)
{
    var fieldStrings = new string[fields.Count];
    for (int j = 0; j < fields.Count; j++)
    {
        try
        {
            fieldStrings[j] = SimpleTsvStrictUtf8.GetString(fields[j]);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Line {line}, column {j} is not valid UTF-8", e);
        }
    }

    return fieldStrings;
}
|
||||||
|
|
||||||
public static Type GetColumnFromType(Type type)
|
public static Type GetColumnFromType(Type type)
|
||||||
{
|
{
|
||||||
if (type == typeof(string))
|
if (type == typeof(string))
|
||||||
@ -1082,11 +1233,11 @@ public class SaneTsv
|
|||||||
public string ColumnName { get; }
|
public string ColumnName { get; }
|
||||||
public virtual Type ColumnType { get; }
|
public virtual Type ColumnType { get; }
|
||||||
|
|
||||||
public TsvColumnAttribute()
|
public TsvColumnAttribute()
|
||||||
{
|
{
|
||||||
ColumnType = typeof(StringType);
|
ColumnType = typeof(StringType);
|
||||||
}
|
}
|
||||||
public TsvColumnAttribute(string columnName)
|
public TsvColumnAttribute(string columnName)
|
||||||
{
|
{
|
||||||
ColumnType = typeof(StringType);
|
ColumnType = typeof(StringType);
|
||||||
ColumnName = columnName;
|
ColumnName = columnName;
|
||||||
|
Loading…
Reference in New Issue
Block a user