Add comment parsing

This commit is contained in:
Nathan McRae 2024-02-14 18:31:58 -08:00
parent 58278d0f53
commit 8673e6a7df
2 changed files with 57 additions and 7 deletions

View File

@ -32,6 +32,7 @@ public class SaneTsv
public ColumnType[] ColumnTypes { get; protected set; } public ColumnType[] ColumnTypes { get; protected set; }
public Dictionary<string, List<object>> Columns { get; protected set; } public Dictionary<string, List<object>> Columns { get; protected set; }
public List<SaneTsvRecord> Records { get; protected set; } public List<SaneTsvRecord> Records { get; protected set; }
public string FileComment { get; protected set; } = null;
public static SaneTsv ParseSaneTsv(byte[] inputBuffer) public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
{ {
@ -59,6 +60,8 @@ public class SaneTsv
var fieldBytes = new List<byte>(); var fieldBytes = new List<byte>();
var fields = new List<byte[]>(); var fields = new List<byte[]>();
var currentComment = new StringBuilder();
int numFields = -1; int numFields = -1;
int line = 1; int line = 1;
int currentLineStart = 0; int currentLineStart = 0;
@ -201,6 +204,12 @@ public class SaneTsv
parsed.ColumnTypes[j] = type; parsed.ColumnTypes[j] = type;
} }
if (currentComment.Length > 0)
{
parsed.FileComment = currentComment.ToString();
currentComment.Clear();
}
fields.Clear(); fields.Clear();
} }
else if (numFields != fields.Count) else if (numFields != fields.Count)
@ -209,7 +218,14 @@ public class SaneTsv
} }
else else
{ {
AddRecord(parsed, fields, line); string comment = null;
if (currentComment.Length > 0)
{
comment = currentComment.ToString();
currentComment.Clear();
}
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear();
} }
line++; line++;
@ -217,7 +233,29 @@ public class SaneTsv
} }
else if (inputBuffer[i] == '#') else if (inputBuffer[i] == '#')
{ {
throw new Exception($"Found unescaped '#' at column {i - currentLineStart}, line {line}"); if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
{
int j = i;
for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
if (j < inputBuffer.Length)
{
var commentBytes = new byte[j - i - 1];
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
currentComment.Append(Encoding.UTF8.GetString(commentBytes));
currentComment.Append("\n");
i = j;
currentLineStart = i + 1;
line++;
}
else
{
throw new Exception("Comments at end of file are not allowed");
}
}
else
{
throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
}
} }
else else
{ {
@ -237,13 +275,20 @@ public class SaneTsv
} }
else else
{ {
AddRecord(parsed, fields, line); string comment = null;
if (currentComment.Length > 0)
{
comment = currentComment.ToString();
currentComment.Clear();
}
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
fields.Clear();
} }
return parsed; return parsed;
} }
protected static void AddRecord(SaneTsv parsed, List<byte[]> fields, int line) protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
{ {
var parsedFields = new object[fields.Count]; var parsedFields = new object[fields.Count];
for (int j = 0; j < fields.Count; j++) for (int j = 0; j < fields.Count; j++)
@ -351,8 +396,7 @@ public class SaneTsv
} }
} }
parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields)); return parsedFields;
fields.Clear();
} }
public SaneTsvRecord this[int i] => Records[i]; public SaneTsvRecord this[int i] => Records[i];
@ -360,14 +404,16 @@ public class SaneTsv
public class SaneTsvRecord public class SaneTsvRecord
{ {
public SaneTsv Parent { get; } public SaneTsv Parent { get; }
public string Comment { get; }
public object[] Fields { get; } public object[] Fields { get; }
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)]; public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
public SaneTsvRecord(SaneTsv parent, object[] fields) public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
{ {
Parent = parent; Parent = parent;
Fields = fields; Fields = fields;
Comment = comment;
} }
} }
} }

View File

@ -51,4 +51,8 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
# Commented TSV # Commented TSV
Comment lines have a '#' character as the first character of the line. Unescaped '#' characters are not allowed on any line that does not start with a '#'; any '#' characters in fields must be escaped.
Comments must be UTF-8 encoded text.
Comments after the last record are an error. Comments after the last record are an error.