Add comment parsing
This commit is contained in:
parent
f740e4bad1
commit
6cea9b7e59
@ -32,6 +32,7 @@ public class SaneTsv
|
||||
public ColumnType[] ColumnTypes { get; protected set; }
|
||||
public Dictionary<string, List<object>> Columns { get; protected set; }
|
||||
public List<SaneTsvRecord> Records { get; protected set; }
|
||||
public string FileComment { get; protected set; } = null;
|
||||
|
||||
public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
|
||||
{
|
||||
@ -59,6 +60,8 @@ public class SaneTsv
|
||||
|
||||
var fieldBytes = new List<byte>();
|
||||
var fields = new List<byte[]>();
|
||||
var currentComment = new StringBuilder();
|
||||
|
||||
int numFields = -1;
|
||||
int line = 1;
|
||||
int currentLineStart = 0;
|
||||
@ -201,6 +204,12 @@ public class SaneTsv
|
||||
parsed.ColumnTypes[j] = type;
|
||||
}
|
||||
|
||||
if (currentComment.Length > 0)
|
||||
{
|
||||
parsed.FileComment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
|
||||
fields.Clear();
|
||||
}
|
||||
else if (numFields != fields.Count)
|
||||
@ -209,7 +218,14 @@ public class SaneTsv
|
||||
}
|
||||
else
|
||||
{
|
||||
AddRecord(parsed, fields, line);
|
||||
string comment = null;
|
||||
if (currentComment.Length > 0)
|
||||
{
|
||||
comment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||
fields.Clear();
|
||||
}
|
||||
|
||||
line++;
|
||||
@ -217,7 +233,29 @@ public class SaneTsv
|
||||
}
|
||||
else if (inputBuffer[i] == '#')
|
||||
{
|
||||
throw new Exception($"Found unescaped '#' at column {i - currentLineStart}, line {line}");
|
||||
if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
|
||||
{
|
||||
int j = i;
|
||||
for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
|
||||
if (j < inputBuffer.Length)
|
||||
{
|
||||
var commentBytes = new byte[j - i - 1];
|
||||
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
|
||||
currentComment.Append(Encoding.UTF8.GetString(commentBytes));
|
||||
currentComment.Append("\n");
|
||||
i = j;
|
||||
currentLineStart = i + 1;
|
||||
line++;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception("Comments at end of file are not allowed");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -237,13 +275,20 @@ public class SaneTsv
|
||||
}
|
||||
else
|
||||
{
|
||||
AddRecord(parsed, fields, line);
|
||||
string comment = null;
|
||||
if (currentComment.Length > 0)
|
||||
{
|
||||
comment = currentComment.ToString();
|
||||
currentComment.Clear();
|
||||
}
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||
fields.Clear();
|
||||
}
|
||||
|
||||
return parsed;
|
||||
}
|
||||
|
||||
protected static void AddRecord(SaneTsv parsed, List<byte[]> fields, int line)
|
||||
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
|
||||
{
|
||||
var parsedFields = new object[fields.Count];
|
||||
for (int j = 0; j < fields.Count; j++)
|
||||
@ -351,8 +396,7 @@ public class SaneTsv
|
||||
}
|
||||
}
|
||||
|
||||
parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
|
||||
fields.Clear();
|
||||
return parsedFields;
|
||||
}
|
||||
|
||||
public SaneTsvRecord this[int i] => Records[i];
|
||||
@ -360,14 +404,16 @@ public class SaneTsv
|
||||
public class SaneTsvRecord
|
||||
{
|
||||
public SaneTsv Parent { get; }
|
||||
public string Comment { get; }
|
||||
public object[] Fields { get; }
|
||||
|
||||
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
|
||||
|
||||
public SaneTsvRecord(SaneTsv parent, object[] fields)
|
||||
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
|
||||
{
|
||||
Parent = parent;
|
||||
Fields = fields;
|
||||
Comment = comment;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -51,4 +51,8 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
|
||||
|
||||
# Commented TSV
|
||||
|
||||
A comment line is any line whose first character is '#'. An unescaped '#' anywhere else — that is, on a line that does not start with '#' — is an error, so any '#' character inside a field must be escaped.
|
||||
|
||||
Comments must be UTF-8 encoded text.
|
||||
|
||||
Comments after the last record are an error.
|
Loading…
Reference in New Issue
Block a user