sane-tsv/Stsv/Stsv.cs

175 lines
4.4 KiB
C#
Raw Normal View History

2024-02-14 02:56:20 +00:00
using System.Text;
namespace NathanMcRae;
/// <summary>
/// Sane Tab-Separated Values: an in-memory table parsed from a byte buffer in which
/// fields are separated by a tab byte, lines by a newline byte, and the first line
/// is the header. Inside a field, <c>\n</c>, <c>\t</c> and <c>\\</c> are the only
/// escape sequences; each field must decode as UTF-8.
/// </summary>
public class Stsv
{
    // Encoding.UTF8 silently replaces malformed sequences with U+FFFD; this instance
    // throws DecoderFallbackException instead, so invalid input is actually rejected.
    private static readonly UTF8Encoding StrictUtf8 =
        new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    // TODO: We need to be able to update all these in tandem somehow

    /// <summary>Column names in header order.</summary>
    public string[] ColumnNames { get; protected set; } = Array.Empty<string>();

    /// <summary>Column-oriented view: column name to that column's values, in record order.</summary>
    public Dictionary<string, List<string>> Columns { get; protected set; } = new Dictionary<string, List<string>>();

    /// <summary>Row-oriented view: one entry per line after the header, in input order.</summary>
    public List<StsvRecord> Records { get; protected set; } = new List<StsvRecord>();

    /// <summary>
    /// Parses a complete buffer into a table.
    /// </summary>
    /// <remarks>
    /// The last line is not expected to end with a newline. A trailing newline is read as
    /// starting one more line consisting of a single empty field, which fails the
    /// field-count check unless the table has exactly one column.
    /// </remarks>
    /// <param name="inputBuffer">Raw bytes of the document.</param>
    /// <returns>The parsed table; a header-only buffer yields a table with no records.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
    /// <exception cref="Exception">
    /// The buffer is empty, contains an unknown or dangling escape, a field is not valid
    /// UTF-8, a column name is repeated, or a line's field count differs from the header's.
    /// </exception>
    public static Stsv Parse(byte[] inputBuffer)
    {
        if (inputBuffer is null)
        {
            throw new ArgumentNullException(nameof(inputBuffer));
        }
        if (inputBuffer.Length == 0)
        {
            throw new Exception("Input is empty; expected at least a header line");
        }

        var parsed = new Stsv();
        var fieldBytes = new List<byte>();
        var fields = new List<string>();
        bool headerParsed = false;

        for (int i = 0; i < inputBuffer.Length; i++)
        {
            byte current = inputBuffer[i];
            if (current == '\\')
            {
                if (i + 1 == inputBuffer.Length)
                {
                    throw new Exception("Found '\\' at end of input");
                }
                switch (inputBuffer[i + 1])
                {
                    case (byte)'n':
                        fieldBytes.Add((byte)'\n');
                        break;
                    case (byte)'\\':
                        fieldBytes.Add((byte)'\\');
                        break;
                    case (byte)'t':
                        fieldBytes.Add((byte)'\t');
                        break;
                    default:
                        throw new Exception($"Expected 'n', 't', or '\\' after '\\' at {i}");
                }
                // Skip the second byte of the escape sequence.
                i++;
            }
            else if (current == '\t')
            {
                // End of field.
                fields.Add(DecodeField(fieldBytes, fields.Count, CurrentLineNumber(parsed, headerParsed)));
                fieldBytes.Clear();
            }
            else if (current == '\n')
            {
                // End of field and end of line.
                fields.Add(DecodeField(fieldBytes, fields.Count, CurrentLineNumber(parsed, headerParsed)));
                fieldBytes.Clear();
                CommitLine(parsed, fields, ref headerParsed);
            }
            else
            {
                fieldBytes.Add(current);
            }
        }

        // The final line has no terminating newline, so flush it here. Going through
        // CommitLine lets a buffer that contains only a header parse successfully.
        fields.Add(DecodeField(fieldBytes, fields.Count, CurrentLineNumber(parsed, headerParsed)));
        CommitLine(parsed, fields, ref headerParsed);

        return parsed;
    }

    // 1-based number of the line currently being read: the header is line 1 and the
    // n-th record is line n + 1.
    private static int CurrentLineNumber(Stsv parsed, bool headerParsed)
    {
        return headerParsed ? parsed.Records.Count + 2 : 1;
    }

    // Decodes one field's bytes as strict UTF-8, reporting its position on failure.
    private static string DecodeField(List<byte> fieldBytes, int fieldIndex, int lineNumber)
    {
        try
        {
            return StrictUtf8.GetString(fieldBytes.ToArray());
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Field {fieldIndex} on line {lineNumber} is not valid UTF-8", e);
        }
    }

    // Stores a completed line as the header (first call) or as a record, then clears
    // the field list so it can be reused for the next line.
    private static void CommitLine(Stsv parsed, List<string> fields, ref bool headerParsed)
    {
        if (!headerParsed)
        {
            var names = new string[fields.Count];
            for (int j = 0; j < fields.Count; j++)
            {
                string columnName = fields[j];
                if (!parsed.Columns.TryAdd(columnName, new List<string>()))
                {
                    throw new Exception($"Column name {columnName} is not unique");
                }
                names[j] = columnName;
            }
            parsed.ColumnNames = names;
            headerParsed = true;
        }
        else if (fields.Count != parsed.ColumnNames.Length)
        {
            throw new Exception($"Expected {parsed.ColumnNames.Length} fields on line {parsed.Records.Count + 2}, but found {fields.Count}");
        }
        else
        {
            for (int j = 0; j < fields.Count; j++)
            {
                parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
            }
            parsed.Records.Add(new StsvRecord(parsed, fields.ToArray()));
        }
        fields.Clear();
    }

    /// <summary>Gets the record at zero-based index <paramref name="i"/> (header excluded).</summary>
    public StsvRecord this[int i] => Records[i];

    /// <summary>
    /// One line of data, holding its fields in header order and a reference back to the
    /// table whose <see cref="Stsv.ColumnNames"/> give those fields their names.
    /// </summary>
    public class StsvRecord
    {
        /// <summary>The table this record belongs to.</summary>
        public Stsv Parent { get; }

        /// <summary>Field values, positionally aligned with the parent's column names.</summary>
        public string[] Fields { get; }

        /// <summary>
        /// Gets the field for the named column via a linear search of the parent's
        /// column names.
        /// </summary>
        /// <exception cref="IndexOutOfRangeException">
        /// <paramref name="columnName"/> is not one of the parent's column names.
        /// </exception>
        public string this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];

        /// <summary>Creates a record bound to <paramref name="parent"/> with the given field values.</summary>
        public StsvRecord(Stsv parent, string[] fields)
        {
            Parent = parent;
            Fields = fields;
        }
    }
}