diff --git a/SaneTsv.cs b/SaneTsv.cs index 7966bcc..7eccd88 100644 --- a/SaneTsv.cs +++ b/SaneTsv.cs @@ -32,6 +32,7 @@ public class SaneTsv public ColumnType[] ColumnTypes { get; protected set; } public Dictionary> Columns { get; protected set; } public List Records { get; protected set; } + public string FileComment { get; protected set; } = null; public static SaneTsv ParseSaneTsv(byte[] inputBuffer) { @@ -59,6 +60,8 @@ public class SaneTsv var fieldBytes = new List(); var fields = new List(); + var currentComment = new StringBuilder(); + int numFields = -1; int line = 1; int currentLineStart = 0; @@ -201,6 +204,12 @@ public class SaneTsv parsed.ColumnTypes[j] = type; } + if (currentComment.Length > 0) + { + parsed.FileComment = currentComment.ToString(); + currentComment.Clear(); + } + fields.Clear(); } else if (numFields != fields.Count) @@ -209,7 +218,14 @@ public class SaneTsv } else { - AddRecord(parsed, fields, line); + string comment = null; + if (currentComment.Length > 0) + { + comment = currentComment.ToString(); + currentComment.Clear(); + } + parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment)); + fields.Clear(); } line++; @@ -217,7 +233,29 @@ public class SaneTsv } else if (inputBuffer[i] == '#') { - throw new Exception($"Found unescaped '#' at column {i - currentLineStart}, line {line}"); + if (i == currentLineStart && format >= FormatType.COMMENTED_TSV) + { + int j = i; + for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { } + if (j < inputBuffer.Length) + { + var commentBytes = new byte[j - i - 1]; + Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1); + currentComment.Append(Encoding.UTF8.GetString(commentBytes)); + currentComment.Append("\n"); + i = j; + currentLineStart = i + 1; + line++; + } + else + { + throw new Exception("Comments at end of file are not allowed"); + } + } + else + { + throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}"); + } } else { @@ -237,13 +275,20 @@ public class SaneTsv } else { - AddRecord(parsed, fields, line); + string comment = null; + if (currentComment.Length > 0) + { + comment = currentComment.ToString(); + currentComment.Clear(); + } + parsed.Records.Add(new SaneTsvRecord(parsed, ParseCurrentRecord(parsed, fields, line), comment)); + fields.Clear(); } return parsed; } - protected static void AddRecord(SaneTsv parsed, List fields, int line) + protected static object[] ParseCurrentRecord(SaneTsv parsed, List fields, int line) { var parsedFields = new object[fields.Count]; for (int j = 0; j < fields.Count; j++) @@ -351,8 +396,7 @@ public class SaneTsv } } - parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields)); - fields.Clear(); + return parsedFields; } public SaneTsvRecord this[int i] => Records[i]; @@ -360,14 +404,16 @@ public class SaneTsv public class SaneTsvRecord { public SaneTsv Parent { get; } + public string Comment { get; } public object[] Fields { get; } public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)]; - public SaneTsvRecord(SaneTsv parent, object[] fields) + public SaneTsvRecord(SaneTsv parent, object[] fields, string comment) { Parent = parent; Fields = fields; + Comment = comment; } } } diff --git a/readme.md b/readme.md index 2ccdde7..5a94db7 100644 --- a/readme.md +++ b/readme.md @@ -51,4 +51,8 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each # Commented TSV +Commented lines start with a '#' character at the beginning of the line. Unescaped '#' characters are not allowed on a line that does not start with a '#'. Any '#' characters in fields must be escaped. + +Comments must be UTF-8 encoded text. + Comments after the last record are an error. \ No newline at end of file