Add freeform ParseSimpleTsv

No column spec, just parse whatever you get
This commit is contained in:
Nathan McRae 2024-02-22 23:16:35 -08:00
parent 695ad1f110
commit 0b213c98c2

View File

@ -644,6 +644,157 @@ public class SaneTsv
return Encoding.UTF8.GetBytes(escapedString.ToString());
}
/// <summary>
/// Parses a Simple TSV buffer without any column specification: the first line is
/// taken as the header and every following line as a record of string fields.
/// </summary>
/// <remarks>
/// Fields are separated by '\t' and lines by '\n'. Inside a field the escape
/// sequences <c>\n</c>, <c>\t</c>, <c>\\</c> and <c>\#</c> are unescaped; any other
/// use of '\' and any unescaped '#' is an error. Input that contains no '\n' is
/// treated as a header-only file with zero records. A trailing '\n' is read as one
/// more line consisting of a single empty field.
/// </remarks>
/// <param name="inputBuffer">Raw bytes of the TSV document, expected to be UTF-8.</param>
/// <returns>
/// The column names from the header line and one string array per record.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
/// <exception cref="Exception">
/// The input contains an invalid escape, an unescaped '#', a column name containing
/// ':', a line whose field count differs from the header, or bytes that are not
/// valid UTF-8.
/// </exception>
public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
{
    if (inputBuffer == null)
    {
        throw new ArgumentNullException(nameof(inputBuffer));
    }

    // Encoding.UTF8 replaces invalid sequences with U+FFFD instead of throwing, so it
    // can never produce the "not valid UTF-8" errors below. Use a throwing decoder.
    var strictUtf8 = new UTF8Encoding(false, true);

    string[] columnNames = null;
    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var records = new List<string[]>();
    int numFields = -1; // -1 until the header line has been consumed
    int line = 1;
    int currentLineStart = 0;

    // Closes the field being accumulated and turns the collected fields of the
    // current line into either the header (first line) or a data record.
    void FinishLine()
    {
        fields.Add(fieldBytes.ToArray());
        fieldBytes.Clear();

        if (numFields < 0)
        {
            // This is the header
            var names = new string[fields.Count];
            for (int j = 0; j < fields.Count; j++)
            {
                string columnString;
                try
                {
                    columnString = strictUtf8.GetString(fields[j]);
                }
                catch (DecoderFallbackException e)
                {
                    throw new Exception($"Column name {j} is not valid UTF-8", e);
                }

                if (columnString.IndexOf(':') >= 0)
                {
                    throw new Exception($"Header {j} contain ':', which is not allowed for column names");
                }

                names[j] = columnString;
            }

            columnNames = names;
            numFields = fields.Count;
        }
        else if (numFields != fields.Count)
        {
            throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
        }
        else
        {
            var fieldStrings = new string[fields.Count];
            for (int j = 0; j < fields.Count; j++)
            {
                try
                {
                    fieldStrings[j] = strictUtf8.GetString(fields[j]);
                }
                catch (DecoderFallbackException e)
                {
                    throw new Exception($"Line {line}, column {j} is not valid UTF-8", e);
                }
            }

            records.Add(fieldStrings);
        }

        fields.Clear();
    }

    for (int i = 0; i < inputBuffer.Length; i++)
    {
        byte current = inputBuffer[i];

        if (current == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            byte unescaped;
            switch ((char)inputBuffer[i + 1])
            {
                case 'n':
                    unescaped = (byte)'\n';
                    break;
                case '\\':
                    unescaped = (byte)'\\';
                    break;
                case 't':
                    unescaped = (byte)'\t';
                    break;
                case '#':
                    unescaped = (byte)'#';
                    break;
                default:
                    throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
            }

            fieldBytes.Add(unescaped);
            i++; // skip the escaped character as well
        }
        else if (current == '\t')
        {
            // end of field
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (current == '\n')
        {
            FinishLine();
            line++;
            currentLineStart = i + 1;
        }
        else if (current == '#')
        {
            throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
        }
        else
        {
            fieldBytes.Add(current);
        }
    }

    // The last line has no terminating '\n'. If no '\n' was seen at all, this line
    // is the header and the result simply contains no records.
    FinishLine();

    return (columnNames, records.ToArray());
}
public static Type GetColumnFromType(Type type)
{
if (type == typeof(string))