175 lines
4.4 KiB
C#
175 lines
4.4 KiB
C#
using System.Text;
|
|
|
|
namespace NathanMcRae;
|
|
|
|
/// <summary>
/// Sane Tab-Separated Values
/// </summary>
/// <remarks>
/// Fields are separated by a tab byte and lines by a newline byte. Inside a field, the escape
/// sequences <c>\n</c>, <c>\t</c> and <c>\\</c> stand for a newline, a tab and a backslash.
/// The first line is the header; every later line must have as many fields as the header.
/// A newline always starts another line, so input ending in a newline is read as having one
/// more (empty) line after it.
/// </remarks>
public class SaneTsv
{
    private const byte Backslash = (byte)'\\';
    private const byte Tab = (byte)'\t';
    private const byte Newline = (byte)'\n';

    // Encoding.UTF8 silently replaces malformed sequences with U+FFFD and never throws; this
    // instance throws DecoderFallbackException instead, so invalid input is actually reported.
    private static readonly UTF8Encoding StrictUtf8 =
        new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    // TODO: We need to be able to update all these in tandem somehow

    /// <summary>Column names, in header order.</summary>
    public string[] ColumnNames { get; protected set; } = Array.Empty<string>();

    /// <summary>Field values of every record, grouped by column name.</summary>
    public Dictionary<string, List<string>> Columns { get; protected set; } = new();

    /// <summary>Parsed records, in input order (the header is not a record).</summary>
    public List<SaneTsvRecord> Records { get; protected set; } = new();

    /// <summary>
    /// Parses a complete Sane TSV document.
    /// </summary>
    /// <param name="inputBuffer">Raw bytes of the document; field contents must be UTF-8.</param>
    /// <returns>A <see cref="SaneTsv"/> holding the header and all records.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
    /// <exception cref="FormatException">
    /// The input is empty, contains an unknown or dangling escape sequence, a field that is not
    /// valid UTF-8, a duplicate column name, or a line whose field count differs from the header.
    /// </exception>
    public static SaneTsv Parse(byte[] inputBuffer)
    {
        ArgumentNullException.ThrowIfNull(inputBuffer);

        if (inputBuffer.Length == 0)
        {
            throw new FormatException("Input is empty; expected at least a header line");
        }

        var parsed = new SaneTsv();
        var fieldBytes = new List<byte>();
        var fields = new List<string>();

        // 1-based number of the line currently being read; line 1 is the header.
        int line = 1;

        for (int i = 0; i < inputBuffer.Length; i++)
        {
            byte current = inputBuffer[i];

            switch (current)
            {
                case Backslash:
                    if (i + 1 == inputBuffer.Length)
                    {
                        throw new FormatException("Found '\\' at end of input");
                    }

                    fieldBytes.Add(Unescape(inputBuffer[i + 1], i));
                    i++; // the escape code byte has been consumed as well
                    break;

                case Tab:
                    fields.Add(DecodeField(fieldBytes, fields.Count, line));
                    break;

                case Newline:
                    fields.Add(DecodeField(fieldBytes, fields.Count, line));
                    parsed.CompleteLine(fields, line);
                    line++;
                    break;

                default:
                    fieldBytes.Add(current);
                    break;
            }
        }

        // The last line is terminated by end of input rather than by a newline. It is the
        // header when the input contains no newline at all.
        fields.Add(DecodeField(fieldBytes, fields.Count, line));
        parsed.CompleteLine(fields, line);

        return parsed;
    }

    /// <summary>Gets the record at zero-based index <paramref name="i"/>.</summary>
    public SaneTsvRecord this[int i] => Records[i];

    // Maps the byte following a backslash to the byte it stands for.
    private static byte Unescape(byte escapeCode, int backslashIndex)
    {
        switch (escapeCode)
        {
            case (byte)'n':
                return Newline;
            case Backslash:
                return Backslash;
            case (byte)'t':
                return Tab;
            default:
                throw new FormatException($"Expected 'n', 't', or '\\' after '\\' at {backslashIndex}");
        }
    }

    // Decodes the accumulated bytes of one field as strict UTF-8 and empties the accumulator.
    private static string DecodeField(List<byte> fieldBytes, int fieldIndex, int line)
    {
        try
        {
            return StrictUtf8.GetString(fieldBytes.ToArray());
        }
        catch (DecoderFallbackException e)
        {
            throw new FormatException($"Field {fieldIndex} on line {line} is not valid UTF-8", e);
        }
        finally
        {
            fieldBytes.Clear();
        }
    }

    // Stores one finished line as the header (line 1) or as a record, then empties `fields`.
    private void CompleteLine(List<string> fields, int line)
    {
        if (line == 1)
        {
            string[] names = fields.ToArray();

            foreach (string columnName in names)
            {
                if (!Columns.TryAdd(columnName, new List<string>()))
                {
                    throw new FormatException($"Column name {columnName} is not unique");
                }
            }

            ColumnNames = names;
        }
        else if (fields.Count != ColumnNames.Length)
        {
            throw new FormatException($"Expected {ColumnNames.Length} fields on line {line}, but found {fields.Count}");
        }
        else
        {
            for (int j = 0; j < fields.Count; j++)
            {
                Columns[ColumnNames[j]].Add(fields[j]);
            }

            Records.Add(new SaneTsvRecord(this, fields.ToArray()));
        }

        fields.Clear();
    }

    /// <summary>
    /// One data line of a <see cref="SaneTsv"/> document.
    /// </summary>
    public class SaneTsvRecord
    {
        /// <summary>The document this record belongs to; supplies the column names.</summary>
        public SaneTsv Parent { get; }

        /// <summary>Field values, in the same order as <see cref="SaneTsv.ColumnNames"/>.</summary>
        public string[] Fields { get; }

        /// <summary>Gets the value of the field in the column named <paramref name="columnName"/>.</summary>
        /// <exception cref="IndexOutOfRangeException">
        /// <paramref name="columnName"/> is not one of the parent's column names.
        /// </exception>
        public string this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];

        /// <summary>Creates a record over <paramref name="fields"/> belonging to <paramref name="parent"/>.</summary>
        public SaneTsvRecord(SaneTsv parent, string[] fields)
        {
            Parent = parent;
            Fields = fields;
        }
    }
}
|