Add freeform ParseSimpleTsv
No column spec, just parse whatever you get
This commit is contained in:
parent
695ad1f110
commit
0b213c98c2
157
SaneTsv.cs
157
SaneTsv.cs
@ -8,7 +8,7 @@ public class Tsv<T> where T : SaneTsv.TsvRecord
|
||||
public virtual List<T> Records { get; set; }
|
||||
}
|
||||
|
||||
public class CommentedTsv<T>: Tsv<T> where T : SaneTsv.TsvRecord
|
||||
public class CommentedTsv<T> : Tsv<T> where T : SaneTsv.TsvRecord
|
||||
{
|
||||
public override List<T> Records { get; set; }
|
||||
public string FileComment { get; set; } = null;
|
||||
@ -72,7 +72,7 @@ public class SaneTsv
|
||||
public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new()
|
||||
{
|
||||
// TODO: add the file comment?
|
||||
return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);
|
||||
return (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);
|
||||
}
|
||||
|
||||
// TODO: Have parsing errors include line / column #
|
||||
@ -355,7 +355,7 @@ public class SaneTsv
|
||||
return parsed;
|
||||
}
|
||||
|
||||
protected static T ParseCurrentCommentedRecord<T> (Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
|
||||
protected static T ParseCurrentCommentedRecord<T>(Type[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
|
||||
{
|
||||
return (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line);
|
||||
}
|
||||
@ -644,6 +644,157 @@ public class SaneTsv
|
||||
return Encoding.UTF8.GetBytes(escapedString.ToString());
|
||||
}
|
||||
|
||||
// Strict decoder: unlike Encoding.UTF8, which substitutes U+FFFD for malformed bytes,
// this instance throws DecoderFallbackException so invalid UTF-8 can be reported.
private static readonly UTF8Encoding SimpleTsvStrictUtf8 = new UTF8Encoding(false, true);

/// <summary>
/// Parses a TSV buffer without any column type specification: the first line supplies the
/// column names and every following line becomes a record of plain strings.
/// </summary>
/// <remarks>
/// Fields are separated by tab and lines by '\n'. Inside a field the escapes <c>\n</c>,
/// <c>\t</c>, <c>\\</c> and <c>\#</c> decode to newline, tab, backslash and '#'. Any other
/// escape, or an unescaped '#', is an error. Whatever follows the final '\n' is always
/// parsed as one more record, so a trailing newline yields either an extra empty record
/// (single-column input) or a field-count error.
/// </remarks>
/// <param name="inputBuffer">Raw bytes of the TSV document.</param>
/// <returns>The column names from the header line and the records in input order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
/// <exception cref="Exception">
/// The input has no '\n'-terminated header line, contains an invalid escape or an unescaped
/// '#', has a column name containing ':', has a line whose field count differs from the
/// header, or contains a field that is not valid UTF-8.
/// </exception>
public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
{
    if (inputBuffer == null)
    {
        throw new ArgumentNullException(nameof(inputBuffer));
    }

    string[] columnNames = null;

    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var records = new List<string[]>();

    // -1 until the header line has been consumed.
    int numFields = -1;
    int line = 1;
    int currentLineStart = 0;

    for (int i = 0; i < inputBuffer.Length; i++)
    {
        byte current = inputBuffer[i];

        if (current == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception("Found '\\' at end of input");
            }

            switch (inputBuffer[i + 1])
            {
                case (byte)'n':
                    fieldBytes.Add((byte)'\n');
                    break;
                case (byte)'\\':
                    fieldBytes.Add((byte)'\\');
                    break;
                case (byte)'t':
                    fieldBytes.Add((byte)'\t');
                    break;
                case (byte)'#':
                    fieldBytes.Add((byte)'#');
                    break;
                default:
                    throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
            }

            // Skip the escaped character that was just consumed.
            i++;
        }
        else if (current == '\t')
        {
            // end of field
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (current == '\n')
        {
            // end of field and end of line
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();

            if (numFields < 0)
            {
                // This is the header
                numFields = fields.Count;
                columnNames = new string[numFields];

                for (int j = 0; j < fields.Count; j++)
                {
                    string columnString;
                    try
                    {
                        columnString = SimpleTsvStrictUtf8.GetString(fields[j]);
                    }
                    catch (DecoderFallbackException e)
                    {
                        throw new Exception($"Column name {j} is not valid UTF-8", e);
                    }

                    if (columnString.Contains(':'))
                    {
                        throw new Exception($"Header {j} contain ':', which is not allowed for column names");
                    }

                    columnNames[j] = columnString;
                }
            }
            else if (numFields != fields.Count)
            {
                throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
            }
            else
            {
                records.Add(DecodeSimpleTsvRecord(fields, line));
            }

            fields.Clear();
            line++;
            currentLineStart = i + 1;
        }
        else if (current == '#')
        {
            throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
        }
        else
        {
            fieldBytes.Add(current);
        }
    }

    // The final line has no terminating '\n', so flush its last field here.
    fields.Add(fieldBytes.ToArray());

    if (numFields < 0)
    {
        // No '\n' was ever seen, so there is no complete header line to compare against.
        throw new Exception("Found no header line terminated by '\\n'");
    }
    if (numFields != fields.Count)
    {
        throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
    }

    records.Add(DecodeSimpleTsvRecord(fields, line));

    return (columnNames, records.ToArray());
}

// Decodes every raw field of one record as strict UTF-8, reporting the line and field index on failure.
private static string[] DecodeSimpleTsvRecord(List<byte[]> fields, int line)
{
    var fieldStrings = new string[fields.Count];
    for (int j = 0; j < fields.Count; j++)
    {
        try
        {
            fieldStrings[j] = SimpleTsvStrictUtf8.GetString(fields[j]);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Line {line}, column {j} is not valid UTF-8", e);
        }
    }
    return fieldStrings;
}
|
||||
|
||||
public static Type GetColumnFromType(Type type)
|
||||
{
|
||||
if (type == typeof(string))
|
||||
|
Loading…
Reference in New Issue
Block a user