From 55fa00a6e7647521deda0b6c9e88666992266be6 Mon Sep 17 00:00:00 2001 From: Nathan McRae Date: Sat, 9 Mar 2024 13:13:41 -0800 Subject: [PATCH] Clarify terminology of 'header'/'column' The header is just the region containing the column name / types. --- SaneTsv/SaneTsv.cs | 122 ++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/SaneTsv/SaneTsv.cs b/SaneTsv/SaneTsv.cs index 9a15281..84c1149 100644 --- a/SaneTsv/SaneTsv.cs +++ b/SaneTsv/SaneTsv.cs @@ -87,9 +87,9 @@ public class SaneTsv } parsed.Records = new List(); - var headerTypes = new List(); - var headerNames = new List(); - var headerPropertyInfos = new List(); + var columnTypes = new List(); + var columnNames = new List(); + var columnPropertyInfos = new List(); int columnCount = 0; foreach (PropertyInfo property in typeof(T).GetProperties()) @@ -100,9 +100,9 @@ public class SaneTsv continue; } - headerNames.Add(attribute.ColumnName ?? property.Name); - headerTypes.Add(attribute.ColumnType ?? GetColumnFromType(property.PropertyType)); - headerPropertyInfos.Add(property); + columnNames.Add(attribute.ColumnName ?? property.Name); + columnTypes.Add(attribute.ColumnType ?? GetColumnFromType(property.PropertyType)); + columnPropertyInfos.Add(property); // TODO: Check that the property type and given column type are compatible columnCount++; } @@ -169,7 +169,7 @@ public class SaneTsv } catch (Exception e) { - throw new Exception($"Header {fields.Count} is not valid UTF-8", e); + throw new Exception($"Header field {fields.Count} is not valid UTF-8", e); } string columnTypeString; @@ -178,7 +178,7 @@ public class SaneTsv { if (format == FormatType.SIMPLE_TSV) { - throw new Exception($"Header {j} contains ':', which is not allowed for column names"); + throw new Exception($"Header field {j} contains ':', which is not allowed for column names"); } columnTypeString = columnString.Split(":").Last(); columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1); @@ -187,7 +187,7 @@ public class SaneTsv { if (format > FormatType.SIMPLE_TSV) { - throw new Exception($"Header {fields.Count} has no type"); + throw new Exception($"Header field {fields.Count} has no type"); } columnTypeString = ""; columnName = columnString; @@ -240,14 +240,14 @@ public class SaneTsv // TODO: Allow lax parsing (only worry about parsing columns that are given in the specifying type - if (headerNames[j] != columnName) + if (columnNames[j] != columnName) { - throw new Exception($"Column {j} has name {columnName}, but expected {headerNames[j]}"); + throw new Exception($"Column {j} has name {columnName}, but expected {columnNames[j]}"); } - if (headerTypes[j] != type) + if (columnTypes[j] != type) { - throw new Exception($"Column {j} has type {type}, but expected {headerTypes[j]}"); + throw new Exception($"Column {j} has type {type}, but expected {columnTypes[j]}"); } } @@ -311,7 +311,7 @@ public class SaneTsv // Complication: it probably depends on processor count if (inputBuffer.Length < 10000) { - parsed.Records.AddRange(Parse(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), currentLineStart - 1, inputBuffer.Length)); + parsed.Records.AddRange(Parse(inputBuffer, format, columnPropertyInfos.ToArray(), columnTypes.ToArray(), currentLineStart - 1, inputBuffer.Length)); return parsed; } else @@ -333,7 +333,7 @@ public class SaneTsv endIndex = (i + 1) * splitCount + parseStart; } - parsedValues[i] = Parse(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), startIndex, endIndex); + parsedValues[i] = Parse(inputBuffer, format, columnPropertyInfos.ToArray(), columnTypes.ToArray(), startIndex, endIndex); }); // TODO: Handle relative line numbers @@ -351,7 +351,7 @@ public class SaneTsv // startIndex is in we'd have to go back to the start of the record's comment, and to know // exactly where that comment started we'd have to go back to the start of the record before that // (not including that other record's comment). - protected static T[] Parse(byte[] inputBuffer, FormatType format, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, int startIndex, int endIndex) where T : TsvRecord, new() + protected static T[] Parse(byte[] inputBuffer, FormatType format, PropertyInfo[] columnPropertyInfos, Type[] columnTypes, int startIndex, int endIndex) where T : TsvRecord, new() { var fieldBytes = new List(); var fields = new List(); @@ -421,9 +421,9 @@ public class SaneTsv fields.Add(fieldBytes.ToArray()); fieldBytes.Clear(); - if (headerTypes.Length != fields.Count) + if (columnTypes.Length != fields.Count) { - throw new Exception($"Expected {headerTypes.Length} fields on line {relativeLine}, but found {fields.Count}"); + throw new Exception($"Expected {columnTypes.Length} fields on line {relativeLine}, but found {fields.Count}"); } else { @@ -433,7 +433,7 @@ public class SaneTsv comment = currentComment.ToString(); currentComment.Clear(); } - parsed.Add(ParseCurrentRecord(headerTypes.ToArray(), headerPropertyInfos.ToArray(), fields, comment, relativeLine)); + parsed.Add(ParseCurrentRecord(columnTypes.ToArray(), columnPropertyInfos.ToArray(), fields, comment, relativeLine)); fields.Clear(); } @@ -493,9 +493,9 @@ public class SaneTsv // TODO throw new Exception("Not sure when this will happen. THis might actuall be fine"); } - if (fields.Count != headerTypes.Length) + if (fields.Count != columnTypes.Length) { - throw new Exception($"Expected {headerTypes} fields on line {relativeLine}, but found {fields.Count}"); + throw new Exception($"Expected {columnTypes} fields on line {relativeLine}, but found {fields.Count}"); } else { @@ -505,7 +505,7 @@ public class SaneTsv comment = currentComment.ToString(); currentComment.Clear(); } - parsed.Add(ParseCurrentRecord(headerTypes.ToArray(), headerPropertyInfos.ToArray(), fields, comment, relativeLine)); + parsed.Add(ParseCurrentRecord(columnTypes.ToArray(), columnPropertyInfos.ToArray(), fields, comment, relativeLine)); fields.Clear(); } @@ -846,7 +846,7 @@ public class SaneTsv public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer) { string[] columnNames = null; - var headers = new List(); + var headerFields = new List(); var fieldBytes = new List(); int startOfData = -1; for (int i = 0; i < inputBuffer.Count(); i++) @@ -885,33 +885,33 @@ public class SaneTsv else if (inputBuffer[i] == '\t') { // end of field - headers.Add(fieldBytes.ToArray()); + headerFields.Add(fieldBytes.ToArray()); fieldBytes.Clear(); } else if (inputBuffer[i] == '\n') { // This is the end of the header - headers.Add(fieldBytes.ToArray()); + headerFields.Add(fieldBytes.ToArray()); startOfData = i + 1; - columnNames = new string[headers.Count]; + columnNames = new string[headerFields.Count]; fieldBytes.Clear(); - for (int j = 0; j < headers.Count; j++) + for (int j = 0; j < headerFields.Count; j++) { string columnString; try { - columnString = Encoding.UTF8.GetString(headers[j]); + columnString = Encoding.UTF8.GetString(headerFields[j]); } catch (Exception e) { - throw new Exception($"Column {headers.Count} name is not valid UTF-8", e); + throw new Exception($"Column {headerFields.Count} name is not valid UTF-8", e); } if (columnString.Contains(':')) { - throw new Exception($"Header {headers.Count} contain ':', which is not allowed for column names"); + throw new Exception($"Header field {headerFields.Count} contain ':', which is not allowed for column names"); } columnNames[j] = columnString; @@ -1184,9 +1184,9 @@ public class SaneTsv { var bytes = new List(); - var headerTypes = new List(); - var headerNames = new List(); - var headerPropertyInfos = new List(); + var columnTypes = new List(); + var columnNames = new List(); + var columnPropertyInfos = new List(); int columnCount = 0; // Serialize header @@ -1199,30 +1199,30 @@ public class SaneTsv } string headerName = attribute.ColumnName ?? property.Name; - headerNames.Add(headerName); + columnNames.Add(headerName); Type headerType = attribute.ColumnType ?? GetColumnFromType(property.PropertyType); if (tsvFormat == FormatType.SIMPLE_TSV && headerType != typeof(StringType)) { throw new Exception($"Serializing Simple TSV requires all columns be of type string, but column '{headerName}' has type '{headerType}'"); } - headerTypes.Add(headerType); - headerPropertyInfos.Add(property); + columnTypes.Add(headerType); + columnPropertyInfos.Add(property); // TODO: Check that the property type and given column type are compatible columnCount++; } // Serialize header - for (int i = 0; i < headerNames.Count; i++) + for (int i = 0; i < columnNames.Count; i++) { - for (int j = i + 1; j < headerNames.Count; j++) + for (int j = i + 1; j < columnNames.Count; j++) { - if (headerNames[i] == headerNames[j]) + if (columnNames[i] == columnNames[j]) { throw new Exception("Column names in header must be unique"); } } - byte[] nameEncoded = Encoding.UTF8.GetBytes(headerNames[i]); + byte[] nameEncoded = Encoding.UTF8.GetBytes(columnNames[i]); for (int j = 0; j < nameEncoded.Length; j++) { @@ -1257,15 +1257,15 @@ public class SaneTsv bytes.Add((byte)':'); try { - bytes.AddRange(Encoding.UTF8.GetBytes(GetNameFromColumn(headerTypes[i]))); + bytes.AddRange(Encoding.UTF8.GetBytes(GetNameFromColumn(columnTypes[i]))); } catch (Exception e) { - throw new Exception($"Invalid header type for column {i}", e); + throw new Exception($"Invalid column type for column {i}", e); } } - if (i == headerNames.Count - 1) + if (i == columnNames.Count - 1) { bytes.Add((byte)'\n'); } @@ -1276,19 +1276,19 @@ public class SaneTsv } // Serialize data - SerializeTsv(data, bytes, headerPropertyInfos.ToArray(), headerTypes.ToArray(), tsvFormat, 0, data.Count); + SerializeTsv(data, bytes, columnPropertyInfos.ToArray(), columnTypes.ToArray(), tsvFormat, 0, data.Count); return bytes.ToArray(); } - protected static void SerializeTsv(IList data, List bytes, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, FormatType tsvFormat, int startIndex, int endIndex) + protected static void SerializeTsv(IList data, List bytes, PropertyInfo[] columnPropertyInfos, Type[] columnTypes, FormatType tsvFormat, int startIndex, int endIndex) { // Serialize data for (int i = 0; i < data.Count; i++) { - for (int j = 0; j < headerTypes.Length; j++) + for (int j = 0; j < columnTypes.Length; j++) { - object datum = headerPropertyInfos[j].GetValue(data[i]); + object datum = columnPropertyInfos[j].GetValue(data[i]); try { @@ -1296,16 +1296,16 @@ public class SaneTsv // Some fields definitely don't need escaping, so we add them directly to bytes bool skipEscaping = false; - if (headerTypes[j] == typeof(StringType)) + if (columnTypes[j] == typeof(StringType)) { fieldEncoded = Encoding.UTF8.GetBytes((string)datum); } - else if (headerTypes[j] == typeof(BooleanType)) + else if (columnTypes[j] == typeof(BooleanType)) { bytes.AddRange((bool)datum ? TrueEncoded : FalseEncoded); skipEscaping = true; } - else if (headerTypes[j] == typeof(Float32Type)) + else if (columnTypes[j] == typeof(Float32Type)) { if (datum is float f) { @@ -1329,7 +1329,7 @@ public class SaneTsv } skipEscaping = true; } - else if (headerTypes[j] == typeof(Float32LEType)) + else if (columnTypes[j] == typeof(Float32LEType)) { if (LittleEndian) { @@ -1345,7 +1345,7 @@ public class SaneTsv } } } - else if (headerTypes[j] == typeof(Float64Type)) + else if (columnTypes[j] == typeof(Float64Type)) { if (datum is double d) { @@ -1369,7 +1369,7 @@ public class SaneTsv } skipEscaping = true; } - else if (headerTypes[j] == typeof(Float64LEType)) + else if (columnTypes[j] == typeof(Float64LEType)) { if (LittleEndian) { @@ -1385,33 +1385,33 @@ public class SaneTsv } } } - else if (headerTypes[j] == typeof(UInt32Type)) + else if (columnTypes[j] == typeof(UInt32Type)) { bytes.AddRange(Encoding.UTF8.GetBytes(((UInt32)datum).ToString())); skipEscaping = true; } - else if (headerTypes[j] == typeof(UInt64Type)) + else if (columnTypes[j] == typeof(UInt64Type)) { bytes.AddRange(Encoding.UTF8.GetBytes(((UInt64)datum).ToString())); skipEscaping = true; } - else if (headerTypes[j] == typeof(Int32Type)) + else if (columnTypes[j] == typeof(Int32Type)) { bytes.AddRange(Encoding.UTF8.GetBytes(((Int32)datum).ToString())); skipEscaping = true; } - else if (headerTypes[j] == typeof(Int64Type)) + else if (columnTypes[j] == typeof(Int64Type)) { bytes.AddRange(Encoding.UTF8.GetBytes(((Int64)datum).ToString())); skipEscaping = true; } - else if (headerTypes[j] == typeof(BinaryType)) + else if (columnTypes[j] == typeof(BinaryType)) { fieldEncoded = (byte[])datum; } else { - throw new Exception($"Unexpected column type {headerTypes[j]} for column {j}"); + throw new Exception($"Unexpected column type {columnTypes[j]} for column {j}"); } if (!skipEscaping) @@ -1445,7 +1445,7 @@ public class SaneTsv } } - if (j < headerTypes.Length - 1) + if (j < columnTypes.Length - 1) { bytes.Add((byte)'\t'); } @@ -1456,7 +1456,7 @@ public class SaneTsv } catch (InvalidCastException e) { - throw new Exception($"Record {i}, field {j} expected type compatible with {GetNameFromColumn(headerTypes[j])}", e); + throw new Exception($"Record {i}, field {j} expected type compatible with {GetNameFromColumn(columnTypes[j])}", e); } } }