Add comment parsing
This commit is contained in:
parent
f740e4bad1
commit
6cea9b7e59
@ -32,6 +32,7 @@ public class SaneTsv
|
|||||||
public ColumnType[] ColumnTypes { get; protected set; }
|
public ColumnType[] ColumnTypes { get; protected set; }
|
||||||
public Dictionary<string, List<object>> Columns { get; protected set; }
|
public Dictionary<string, List<object>> Columns { get; protected set; }
|
||||||
public List<SaneTsvRecord> Records { get; protected set; }
|
public List<SaneTsvRecord> Records { get; protected set; }
|
||||||
|
public string FileComment { get; protected set; } = null;
|
||||||
|
|
||||||
public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
|
public static SaneTsv ParseSaneTsv(byte[] inputBuffer)
|
||||||
{
|
{
|
||||||
@ -59,6 +60,8 @@ public class SaneTsv
|
|||||||
|
|
||||||
var fieldBytes = new List<byte>();
|
var fieldBytes = new List<byte>();
|
||||||
var fields = new List<byte[]>();
|
var fields = new List<byte[]>();
|
||||||
|
var currentComment = new StringBuilder();
|
||||||
|
|
||||||
int numFields = -1;
|
int numFields = -1;
|
||||||
int line = 1;
|
int line = 1;
|
||||||
int currentLineStart = 0;
|
int currentLineStart = 0;
|
||||||
@ -201,6 +204,12 @@ public class SaneTsv
|
|||||||
parsed.ColumnTypes[j] = type;
|
parsed.ColumnTypes[j] = type;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (currentComment.Length > 0)
|
||||||
|
{
|
||||||
|
parsed.FileComment = currentComment.ToString();
|
||||||
|
currentComment.Clear();
|
||||||
|
}
|
||||||
|
|
||||||
fields.Clear();
|
fields.Clear();
|
||||||
}
|
}
|
||||||
else if (numFields != fields.Count)
|
else if (numFields != fields.Count)
|
||||||
@ -209,7 +218,14 @@ public class SaneTsv
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
AddRecord(parsed, fields, line);
|
string comment = null;
|
||||||
|
if (currentComment.Length > 0)
|
||||||
|
{
|
||||||
|
comment = currentComment.ToString();
|
||||||
|
currentComment.Clear();
|
||||||
|
}
|
||||||
|
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||||
|
fields.Clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
line++;
|
line++;
|
||||||
@ -217,7 +233,29 @@ public class SaneTsv
|
|||||||
}
|
}
|
||||||
else if (inputBuffer[i] == '#')
|
else if (inputBuffer[i] == '#')
|
||||||
{
|
{
|
||||||
throw new Exception($"Found unescaped '#' at column {i - currentLineStart}, line {line}");
|
if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
|
||||||
|
{
|
||||||
|
int j = i;
|
||||||
|
for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
|
||||||
|
if (j < inputBuffer.Length)
|
||||||
|
{
|
||||||
|
var commentBytes = new byte[j - i - 1];
|
||||||
|
Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
|
||||||
|
currentComment.Append(Encoding.UTF8.GetString(commentBytes));
|
||||||
|
currentComment.Append("\n");
|
||||||
|
i = j;
|
||||||
|
currentLineStart = i + 1;
|
||||||
|
line++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception("Comments at end of file are not allowed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -237,13 +275,20 @@ public class SaneTsv
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
AddRecord(parsed, fields, line);
|
string comment = null;
|
||||||
|
if (currentComment.Length > 0)
|
||||||
|
{
|
||||||
|
comment = currentComment.ToString();
|
||||||
|
currentComment.Clear();
|
||||||
|
}
|
||||||
|
parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment));
|
||||||
|
fields.Clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
return parsed;
|
return parsed;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static void AddRecord(SaneTsv parsed, List<byte[]> fields, int line)
|
protected static object[] ParseCurrentRecord(SaneTsv parsed, List<byte[]> fields, int line)
|
||||||
{
|
{
|
||||||
var parsedFields = new object[fields.Count];
|
var parsedFields = new object[fields.Count];
|
||||||
for (int j = 0; j < fields.Count; j++)
|
for (int j = 0; j < fields.Count; j++)
|
||||||
@ -351,8 +396,7 @@ public class SaneTsv
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
|
return parsedFields;
|
||||||
fields.Clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public SaneTsvRecord this[int i] => Records[i];
|
public SaneTsvRecord this[int i] => Records[i];
|
||||||
@ -360,14 +404,16 @@ public class SaneTsv
|
|||||||
public class SaneTsvRecord
|
public class SaneTsvRecord
|
||||||
{
|
{
|
||||||
public SaneTsv Parent { get; }
|
public SaneTsv Parent { get; }
|
||||||
|
public string Comment { get; }
|
||||||
public object[] Fields { get; }
|
public object[] Fields { get; }
|
||||||
|
|
||||||
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
|
public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
|
||||||
|
|
||||||
public SaneTsvRecord(SaneTsv parent, object[] fields)
|
public SaneTsvRecord(SaneTsv parent, object[] fields, string comment)
|
||||||
{
|
{
|
||||||
Parent = parent;
|
Parent = parent;
|
||||||
Fields = fields;
|
Fields = fields;
|
||||||
|
Comment = comment;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -51,4 +51,8 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
|
|||||||
|
|
||||||
# Commented TSV
|
# Commented TSV
|
||||||
|
|
||||||
|
A comment line is a line whose first character is '#'. On any line that does not start with a '#', unescaped '#' characters are not allowed: every '#' character inside a field must be escaped.
|
||||||
|
|
||||||
|
Comments must be UTF-8 encoded text.
|
||||||
|
|
||||||
Comments after the last record are an error.
|
Comments after the last record are an error.
|
Loading…
Reference in New Issue
Block a user