Add comment parsing

This commit is contained in:
Nathan McRae 2024-02-14 18:31:58 -08:00
parent f740e4bad1
commit 6cea9b7e59
2 changed files with 57 additions and 7 deletions

View File

@ -32,6 +32,7 @@ public class SaneTsv
public ColumnType[] ColumnTypes { get; protected set; }
public Dictionary<string, List<object>> Columns { get; protected set; }
public List<SaneTsvRecord> Records { get; protected set; }
public string FileComment { get; protected set; } = null;
public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
{
@ -59,6 +60,8 @@ public class SaneTsv
var fieldBytes = new List<byte>();
var fields = new List<byte[]>();
var currentComment = new StringBuilder();
int numFields = -1;
int line = 1;
int currentLineStart = 0;
@ -201,6 +204,12 @@ public class SaneTsv
parsed.ColumnTypes[j] = type;
}
if (currentComment.Length > 0)
{
parsed.FileComment = currentComment.ToString();
currentComment.Clear();
}
fields.Clear();
}
else if (numFields != fields.Count)
@ -209,7 +218,14 @@ public class SaneTsv
}
else
{
AddRecord(parsed, fields, line);
string comment = null;
if (currentComment.Length > 0)
{
comment = currentComment.ToString();
currentComment.Clear();
}
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear();
}
line++;
@ -217,7 +233,29 @@ public class SaneTsv
}
else if (inputBuffer[i] == '#')
{
throw new Exception($"Found unescaped '#' at column {i - currentLineStart}, line {line}");
if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
{
int j = i;
for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
if (j < inputBuffer.Length)
{
var commentBytes = new byte[j - i - 1];
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
currentComment.Append(Encoding.UTF8.GetString(commentBytes));
currentComment.Append("\n");
i = j;
currentLineStart = i + 1;
line++;
}
else
{
throw new Exception("Comments at end of file are not allowed");
}
}
else
{
throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
}
}
else
{
@ -237,13 +275,20 @@ public class SaneTsv
}
else
{
AddRecord(parsed, fields, line);
string comment = null;
if (currentComment.Length > 0)
{
comment = currentComment.ToString();
currentComment.Clear();
}
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear();
}
return parsed;
}
protected static void AddRecord(SaneTsv parsed, List<byte[]> fields, int line)
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
{
var parsedFields = new object[fields.Count];
for (int j = 0; j < fields.Count; j++)
@ -351,8 +396,7 @@ public class SaneTsv
}
}
parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
fields.Clear();
return parsedFields;
}
public SaneTsvRecord this[int i] => Records[i];
@ -360,14 +404,16 @@ public class SaneTsv
/// <summary>
/// A single parsed record (one data line) belonging to a <see cref="SaneTsv"/>.
/// </summary>
public class SaneTsvRecord
{
/// <summary>The <see cref="SaneTsv"/> instance this record was created for.</summary>
public SaneTsv Parent { get; }
/// <summary>
/// Text of the '#' comment lines that preceded this record, or null when there were none.
/// The parser drops each line's leading '#' and appends "\n" after every comment line,
/// so a non-null value ends with a newline.
/// </summary>
public string Comment { get; }
/// <summary>The parsed field values of this record, as produced by the parser.</summary>
public object[] Fields { get; }
/// <summary>
/// Returns the field whose position matches <paramref name="columnName"/> in
/// <c>Parent.ColumnNames</c>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably ColumnNames is a string[]; for an unknown name
/// Array.IndexOf would return -1 and the Fields lookup would throw — confirm.
/// </remarks>
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
// NOTE(review): the two-argument signature on the next line looks like the pre-change
// line of this diff; the three-argument signature after it is its replacement — confirm.
public SaneTsvRecord(SaneTsv parent, object[] fields)
/// <summary>Creates a record with its owning table, parsed fields and optional comment.</summary>
/// <param name="parent">The table this record belongs to.</param>
/// <param name="fields">The already-parsed field values.</param>
/// <param name="comment">Comment text attached to this record; the parser passes null when there is none.</param>
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
{
Parent = parent;
Fields = fields;
Comment = comment;
}
}
}

View File

@ -51,4 +51,8 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
# Commented TSV
A comment line starts with a '#' character at the very beginning of the line. On any line that does not start with a '#', unescaped '#' characters are not allowed, so any '#' characters in fields must be escaped.
Comments must be UTF-8 encoded text.
Comments after the last record are an error: a file must not end with a comment.