From a66f6a1368e2274fad5e1377b56869e3d8ae9560 Mon Sep 17 00:00:00 2001 From: Nathan McRae Date: Fri, 8 Mar 2024 12:31:40 -0800 Subject: [PATCH] Make parallel implementations the default --- SaneTsv.cs | 826 +---------------------------------------- SaneTsvTest/Program.cs | 216 ----------- 2 files changed, 11 insertions(+), 1031 deletions(-) diff --git a/SaneTsv.cs b/SaneTsv.cs index 21a4945..366f972 100644 --- a/SaneTsv.cs +++ b/SaneTsv.cs @@ -75,7 +75,6 @@ public class SaneTsv return (CommentedTsv)Parse(inputBuffer, FormatType.COMMENTED_TSV); } - // TODO: Have parsing errors include line / column # protected static Tsv Parse(byte[] inputBuffer, FormatType format) where T : TsvRecord, new() { Tsv parsed; @@ -113,285 +112,6 @@ public class SaneTsv var fields = new List(); var currentComment = new StringBuilder(); - int numFields = -1; - int line = 1; - int currentLineStart = 0; - for (int i = 0; i < inputBuffer.Count(); i++) - { - if (inputBuffer[i] == '\\') - { - if (i + 1 == inputBuffer.Count()) - { - throw new Exception($"Found '\\' at end of input"); - } - if (inputBuffer[i + 1] == 'n') - { - fieldBytes.Add((byte)'\n'); - i++; - } - else if (inputBuffer[i + 1] == '\\') - { - fieldBytes.Add((byte)'\\'); - i++; - } - else if (inputBuffer[i + 1] == 't') - { - fieldBytes.Add((byte)'\t'); - i++; - } - else if (inputBuffer[i + 1] == '#') - { - fieldBytes.Add((byte)'#'); - i++; - } - else - { - throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}"); - } - } - else if (inputBuffer[i] == '\t') - { - // end of field - fields.Add(fieldBytes.ToArray()); - fieldBytes.Clear(); - } - else if (inputBuffer[i] == '\n') - { - fields.Add(fieldBytes.ToArray()); - fieldBytes.Clear(); - - if (numFields < 0) - { - // This is the header - - numFields = fields.Count; - - int numTypesBlank = 0; - - for (int j = 0; j < fields.Count; j++) - { - string columnString; - try - { - columnString = Encoding.UTF8.GetString(fields[j]); - } - catch (Exception e) - { - throw new Exception($"Header {fields.Count} is not valid UTF-8", e); - } - - string columnTypeString; - string columnName; - if (columnString.Contains(':')) - { - if (format == FormatType.SIMPLE_TSV) - { - throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names"); - } - columnTypeString = columnString.Split(":").Last(); - columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1); - } - else - { - if (format > FormatType.SIMPLE_TSV) - { - throw new Exception($"Header {fields.Count} has no type"); - } - columnTypeString = ""; - columnName = columnString; - } - - Type type; - - switch (columnTypeString) - { - case "": - numTypesBlank++; - type = typeof(StringType); - break; - case "string": - type = typeof(StringType); - break; - case "boolean": - type = typeof(BooleanType); - break; - case "float32": - type = typeof(Float32Type); - break; - case "float32-le": - type = typeof(Float32LEType); - break; - case "float64": - type = typeof(Float64Type); - break; - case "float64-le": - type = typeof(Float64LEType); - break; - case "uint32": - type = typeof(UInt32Type); - break; - case "uint64": - type = typeof(UInt64Type); - break; - case "int32": - type = typeof(Int32Type); - break; - case "int64": - type = typeof(Int64Type); - break; - case "binary": - type = typeof(BinaryType); - break; - default: - throw new Exception($"Invalid type '{columnTypeString}' for column {j}"); - } - - // TODO: Check column name uniqueness - // TODO: Allow lax parsing (only worry about parsing columns that are given in the specifying type - - if (headerNames[j] != columnName) - { - throw new Exception($"Column {j} has name {columnName}, but expected {headerNames[j]}"); - } - - if (headerTypes[j] != type) - { - throw new Exception($"Column {j} has type {type}, but expected {headerTypes[j]}"); - } - } - - if (currentComment.Length > 0) - { - if (parsed is CommentedTsv commentedParsed) - { - commentedParsed.FileComment = currentComment.ToString(); - currentComment.Clear(); - } - else - { - throw new Exception("Found a file comment, but parser wasn't expecting a comment"); - } - } - - - fields.Clear(); - } - else if (numFields != fields.Count) - { - throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}"); - } - else - { - string comment = null; - if (currentComment.Length > 0) - { - comment = currentComment.ToString(); - currentComment.Clear(); - } - parsed.Records.Add(ParseCurrentRecord(headerTypes.ToArray(), headerPropertyInfos.ToArray(), fields, comment, line)); - fields.Clear(); - } - - line++; - currentLineStart = i + 1; - } - else if (inputBuffer[i] == '#') - { - if (i == currentLineStart && format >= FormatType.COMMENTED_TSV) - { - int j = i; - for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { } - if (j < inputBuffer.Length) - { - var commentBytes = new byte[j - i - 1]; - Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1); - if (currentComment.Length > 0) - { - currentComment.Append('\n'); - } - currentComment.Append(Encoding.UTF8.GetString(commentBytes)); - i = j; - currentLineStart = i + 1; - line++; - } - else - { - throw new Exception("Comments at end of file are not allowed"); - } - } - else - { - throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}"); - } - } - else - { - fieldBytes.Add(inputBuffer[i]); - } - } - - fields.Add(fieldBytes.ToArray()); - - if (fields.Count == 0) - { - throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record"); - } - if (numFields != fields.Count) - { - throw new Exception($"Expected {numFields} fields on line {parsed.Records.Count + 2}, but found {fields.Count}"); - } - else - { - string comment = null; - if (currentComment.Length > 0) - { - comment = currentComment.ToString(); - currentComment.Clear(); - } - parsed.Records.Add(ParseCurrentRecord(headerTypes.ToArray(), headerPropertyInfos.ToArray(), fields, comment, line)); - fields.Clear(); - } - - return parsed; - } - - protected static Tsv ParseParallel(byte[] inputBuffer, FormatType format) where T : TsvRecord, new() - { - Tsv parsed; - if (format == FormatType.COMMENTED_TSV) - { - parsed = new CommentedTsv(); - } - else - { - parsed = new Tsv(); - } - parsed.Records = new List(); - - var headerTypes = new List(); - var headerNames = new List(); - var headerPropertyInfos = new List(); - int columnCount = 0; - - foreach (PropertyInfo property in typeof(T).GetProperties()) - { - TypedTsvColumnAttribute attribute = (TypedTsvColumnAttribute)Attribute.GetCustomAttribute(property, typeof(TypedTsvColumnAttribute)); - if (attribute == null) - { - continue; - } - - headerNames.Add(attribute.ColumnName ?? property.Name); - headerTypes.Add(attribute.ColumnType ?? GetColumnFromType(property.PropertyType)); - headerPropertyInfos.Add(property); - // TODO: Check that the property type and given column type are compatible - columnCount++; - } - - var fieldBytes = new List(); - var fields = new List(); - var currentComment = new StringBuilder(); - int numFields = -1; int line = 1; int currentLineStart = 0; @@ -592,7 +312,7 @@ public class SaneTsv // Complication: it probably depends on processor count if (inputBuffer.Length < 10000) { - parsed.Records.AddRange(ParseParallel(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), currentLineStart - 1, inputBuffer.Length)); + parsed.Records.AddRange(Parse(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), currentLineStart - 1, inputBuffer.Length)); return parsed; } else @@ -614,7 +334,7 @@ public class SaneTsv endIndex = (i + 1) * splitCount + parseStart; } - parsedValues[i] = ParseParallel(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), startIndex, endIndex); + parsedValues[i] = Parse(inputBuffer, format, headerPropertyInfos.ToArray(), headerTypes.ToArray(), startIndex, endIndex); }); for (int i = 0; i < tasks; i++) @@ -631,7 +351,7 @@ public class SaneTsv // startIndex is in we'd have to go back to the start of the record's comment, and to know // exactly where that comment started we'd have to go back to the start of the record before that // (not including that other record's comment). - protected static T[] ParseParallel(byte[] inputBuffer, FormatType format, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, int startIndex, int endIndex) where T : TsvRecord, new() + protected static T[] Parse(byte[] inputBuffer, FormatType format, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, int startIndex, int endIndex) where T : TsvRecord, new() { var fieldBytes = new List(); var fields = new List(); @@ -985,103 +705,6 @@ public class SaneTsv } public static byte[] SerializeSimpleTsv(IList header, IList> data) - { - var escapedString = new StringBuilder(); - - // Serialize header - for (int i = 0; i < header.Count; i++) - { - if (header[i].Contains(':')) - { - throw new Exception($"Column {i} contains the character ':'"); - } - - for (int j = i + 1; j < header.Count; j++) - { - if (header[i] == header[j]) - { - throw new Exception("Column names in header must be unique"); - } - } - - for (int j = 0; j < header[i].Count(); j++) - { - if (header[i][j] == '\n') - { - escapedString.Append("\\n"); - } - else if (header[i][j] == '\t') - { - escapedString.Append("\\t"); - } - else if (header[i][j] == '\\') - { - escapedString.Append("\\\\"); - } - else if (header[i][j] == '#') - { - escapedString.Append("\\#"); - } - else - { - escapedString.Append(header[i][j]); - } - } - - if (i == header.Count - 1) - { - escapedString.Append('\n'); - } - else - { - escapedString.Append('\t'); - } - } - - // Serialize data - for (int i = 0; i < data.Count; i++) - { - for (int j = 0; j < data[i].Count; j++) - { - for (int k = 0; k < data[i][j].Length; k++) - { - if (data[i][j][k] == '\n') - { - escapedString.Append("\\n"); - } - else if (data[i][j][k] == '\t') - { - escapedString.Append("\\t"); - } - else if (data[i][j][k] == '\\') - { - escapedString.Append("\\\\"); - } - else if (data[i][j][k] == '#') - { - escapedString.Append("\\#"); - } - else - { - escapedString.Append(data[i][j][k]); - } - } - - if (j < data[i].Count - 1) - { - escapedString.Append('\t'); - } - else if (i < data.Count - 1) - { - escapedString.Append('\n'); - } - } - } - - return Encoding.UTF8.GetBytes(escapedString.ToString()); - } - - public static byte[] SerializeSimpleTsvParallel(IList header, IList> data) { var serialized = new List(); var escapedString = new StringBuilder(); @@ -1142,7 +765,7 @@ public class SaneTsv // Complication: it probably depends on processor count if (data.Count < 100) { - serialized.AddRange(Encoding.UTF8.GetBytes(SerializeSimpleTsvParallel(data, 0, data.Count))); + serialized.AddRange(Encoding.UTF8.GetBytes(SerializeSimpleTsv(data, 0, data.Count))); } else { @@ -1160,7 +783,7 @@ public class SaneTsv { endIndex = (i + 1) * splitCount; } - string escapedString = SerializeSimpleTsvParallel(data, i * splitCount, endIndex); + string escapedString = SerializeSimpleTsv(data, i * splitCount, endIndex); bytes[i] = Encoding.UTF8.GetBytes(escapedString); }); @@ -1173,7 +796,7 @@ public class SaneTsv return serialized.ToArray(); } - public static string SerializeSimpleTsvParallel(IList> data, int startIndex, int endIndex) + public static string SerializeSimpleTsv(IList> data, int startIndex, int endIndex) { var escapedString = new StringBuilder(); @@ -1220,159 +843,7 @@ public class SaneTsv return escapedString.ToString(); } - public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer) - { - string[] columnNames = null; - - var fieldBytes = new List(); - var fields = new List(); - var records = new List(); - - int numFields = -1; - int line = 1; - int currentLineStart = 0; - for (int i = 0; i < inputBuffer.Count(); i++) - { - if (inputBuffer[i] == '\\') - { - if (i + 1 == inputBuffer.Count()) - { - throw new Exception($"Found '\\' at end of input"); - } - if (inputBuffer[i + 1] == 'n') - { - fieldBytes.Add((byte)'\n'); - i++; - } - else if (inputBuffer[i + 1] == '\\') - { - fieldBytes.Add((byte)'\\'); - i++; - } - else if (inputBuffer[i + 1] == 't') - { - fieldBytes.Add((byte)'\t'); - i++; - } - else if (inputBuffer[i + 1] == '#') - { - fieldBytes.Add((byte)'#'); - i++; - } - else - { - throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}"); - } - } - else if (inputBuffer[i] == '\t') - { - // end of field - fields.Add(fieldBytes.ToArray()); - fieldBytes.Clear(); - } - else if (inputBuffer[i] == '\n') - { - fields.Add(fieldBytes.ToArray()); - fieldBytes.Clear(); - - if (numFields < 0) - { - // This is the header - - numFields = fields.Count; - - columnNames = new string[numFields]; - - for (int j = 0; j < fields.Count; j++) - { - string columnString; - try - { - columnString = Encoding.UTF8.GetString(fields[j]); - } - catch (Exception e) - { - throw new Exception($"Column name {fields.Count} is not valid UTF-8", e); - } - - if (columnString.Contains(':')) - { - throw new Exception($"Header {fields.Count} contain ':', which is not allowed for column names"); - } - - columnNames[j] = columnString; - } - - fields.Clear(); - } - else if (numFields != fields.Count) - { - throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}"); - } - else - { - var fieldStrings = new string[fields.Count]; - for (int j = 0; j < fields.Count; j++) - { - try - { - fieldStrings[j] = Encoding.UTF8.GetString(fields[j]); - } - catch (Exception e) - { - throw new Exception($"Line {line}, column {j} is not valid UTF-8", e); - } - } - records.Add(fieldStrings); - fields.Clear(); - } - - line++; - currentLineStart = i + 1; - } - else if (inputBuffer[i] == '#') - { - throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}"); - } - else - { - fieldBytes.Add(inputBuffer[i]); - } - } - - fields.Add(fieldBytes.ToArray()); - - if (numFields == 0) - { - throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record"); - } - if (numFields != fields.Count) - { - throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}"); - } - else - { - var fieldStrings = new string[fields.Count]; - for (int j = 0; j < fields.Count; j++) - { - try - { - fieldStrings[j] = Encoding.UTF8.GetString(fields[j]); - } - catch (Exception e) - { - throw new Exception($"Line {line}, column {j} is not valid UTF-8", e); - } - } - records.Add(fieldStrings); - fields.Clear(); - } - - return (columnNames, records.ToArray()); - } - - public static (string[] columns, string[][] data) ParseSimpleTsvParallel(byte[] inputBuffer) { string[] columnNames = null; var headers = new List(); @@ -1459,10 +930,10 @@ public class SaneTsv } } - return (columnNames, ParseSimpleTsvParallel(inputBuffer, columnNames.Length, startOfData, inputBuffer.Length)); + return (columnNames, ParseSimpleTsv(inputBuffer, columnNames.Length, startOfData, inputBuffer.Length)); } - public static string[][] ParseSimpleTsvParallel(byte[] inputBuffer, int numFields, int startIndex, int endIndex) + public static string[][] ParseSimpleTsv(byte[] inputBuffer, int numFields, int startIndex, int endIndex) { var fieldBytes = new List(); var fields = new List(); @@ -1702,282 +1173,7 @@ public class SaneTsv return SerializeTsv(data, FormatType.COMMENTED_TSV); } - protected static byte[] SerializeTsv(IList data, FormatType tsvFormat) where T : TsvRecord - { - var bytes = new List(); - - var headerTypes = new List(); - var headerNames = new List(); - var headerPropertyInfos = new List(); - int columnCount = 0; - - foreach (PropertyInfo property in typeof(T).GetProperties()) - { - TsvColumnAttribute attribute = (TsvColumnAttribute)Attribute.GetCustomAttribute(property, typeof(TsvColumnAttribute)); - if (attribute == null) - { - continue; - } - - string headerName = attribute.ColumnName ?? property.Name; - headerNames.Add(headerName); - Type headerType = attribute.ColumnType ?? GetColumnFromType(property.PropertyType); - if (tsvFormat == FormatType.SIMPLE_TSV && headerType != typeof(StringType)) - { - throw new Exception($"Serializing Simple TSV requires all columns be of type string, but column '{headerName}' has type '{headerType}'"); - } - headerTypes.Add(headerType); - headerPropertyInfos.Add(property); - // TODO: Check that the property type and given column type are compatible - columnCount++; - } - - // Serialize header - for (int i = 0; i < headerNames.Count; i++) - { - for (int j = i + 1; j < headerNames.Count; j++) - { - if (headerNames[i] == headerNames[j]) - { - throw new Exception("Column names in header must be unique"); - } - } - - byte[] nameEncoded = Encoding.UTF8.GetBytes(headerNames[i]); - - for (int j = 0; j < nameEncoded.Length; j++) - { - if (nameEncoded[j] == '\n') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'n'); - } - else if (nameEncoded[j] == '\t') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'t'); - } - else if (nameEncoded[j] == '\\') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'\\'); - } - else if (nameEncoded[j] == '#') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'#'); - } - else - { - bytes.Add(nameEncoded[j]); - } - } - - if (tsvFormat != FormatType.SIMPLE_TSV) - { - bytes.Add((byte)':'); - try - { - bytes.AddRange(Encoding.UTF8.GetBytes(GetNameFromColumn(headerTypes[i]))); - } - catch (Exception e) - { - throw new Exception($"Invalid header type for column {i}", e); - } - } - - if (i == headerNames.Count - 1) - { - bytes.Add((byte)'\n'); - } - else - { - bytes.Add((byte)'\t'); - } - } - - // Serialize data - for (int i = 0; i < data.Count; i++) - { - for (int j = 0; j < columnCount; j++) - { - object datum = headerPropertyInfos[j].GetValue(data[i]); - - try - { - byte[] fieldEncoded = null; - // Some fields definitely don't need escaping, so we add them directly to bytes - bool skipEscaping = false; - - if (headerTypes[j] == typeof(StringType)) - { - fieldEncoded = Encoding.UTF8.GetBytes((string)datum); - } - else if (headerTypes[j] == typeof(BooleanType)) - { - bytes.AddRange((bool)datum ? TrueEncoded : FalseEncoded); - skipEscaping = true; - } - else if (headerTypes[j] == typeof(Float32Type)) - { - if (datum is float f) - { - if (float.IsNegativeInfinity(f)) - { - bytes.AddRange(Encoding.UTF8.GetBytes("-inf")); - } - else if (float.IsPositiveInfinity(f)) - { - bytes.AddRange(Encoding.UTF8.GetBytes("+inf")); - } - else - { - // See https://learn.microsoft.com/en-us/dotnet/standard/base-types/standard-numeric-format-strings#round-trip-format-specifier-r - bytes.AddRange(Encoding.UTF8.GetBytes(((float)datum).ToString("G9"))); - } - } - else - { - throw new InvalidCastException(); - } - skipEscaping = true; - } - else if (headerTypes[j] == typeof(Float32LEType)) - { - if (LittleEndian) - { - fieldEncoded = BitConverter.GetBytes((float)datum); - } - else - { - byte[] floatBytes = BitConverter.GetBytes((float)datum); - fieldEncoded = new byte[sizeof(float)]; - for (int k = 0; k < sizeof(float); k++) - { - fieldEncoded[k] = floatBytes[sizeof(float) - 1 - k]; - } - } - } - else if (headerTypes[j] == typeof(Float64Type)) - { - if (datum is double d) - { - if (double.IsNegativeInfinity(d)) - { - bytes.AddRange(Encoding.UTF8.GetBytes("-inf")); - } - else if (double.IsPositiveInfinity(d)) - { - bytes.AddRange(Encoding.UTF8.GetBytes("+inf")); - } - else - { - // See https://learn.microsoft.com/en-us/dotnet/standard/base-types/standard-numeric-format-strings#round-trip-format-specifier-r - bytes.AddRange(Encoding.UTF8.GetBytes((d).ToString("G17"))); - } - } - else - { - throw new InvalidCastException(); - } - skipEscaping = true; - } - else if (headerTypes[j] == typeof(Float64LEType)) - { - if (LittleEndian) - { - fieldEncoded = BitConverter.GetBytes((double)datum); - } - else - { - byte[] doubleBytes = BitConverter.GetBytes((double)datum); - fieldEncoded = new byte[sizeof(double)]; - for (int k = 0; k < sizeof(double); k++) - { - fieldEncoded[k] = doubleBytes[sizeof(double) - 1 - k]; - } - } - } - else if (headerTypes[j] == typeof(UInt32Type)) - { - bytes.AddRange(Encoding.UTF8.GetBytes(((UInt32)datum).ToString())); - skipEscaping = true; - } - else if (headerTypes[j] == typeof(UInt64Type)) - { - bytes.AddRange(Encoding.UTF8.GetBytes(((UInt64)datum).ToString())); - skipEscaping = true; - } - else if (headerTypes[j] == typeof(Int32Type)) - { - bytes.AddRange(Encoding.UTF8.GetBytes(((Int32)datum).ToString())); - skipEscaping = true; - } - else if (headerTypes[j] == typeof(Int64Type)) - { - bytes.AddRange(Encoding.UTF8.GetBytes(((Int64)datum).ToString())); - skipEscaping = true; - } - else if (headerTypes[j] == typeof(BinaryType)) - { - fieldEncoded = (byte[])datum; - } - else - { - throw new Exception($"Unexpected column type {headerTypes[j]} for column {j}"); - } - - if (!skipEscaping) - { - for (int k = 0; k < fieldEncoded.Length; k++) - { - if (fieldEncoded[k] == '\n') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'n'); - } - else if (fieldEncoded[k] == '\t') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'t'); - } - else if (fieldEncoded[k] == '\\') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'\\'); - } - else if (fieldEncoded[k] == '#') - { - bytes.Add((byte)'\\'); - bytes.Add((byte)'#'); - } - else - { - bytes.Add(fieldEncoded[k]); - } - } - } - - if (j < columnCount - 1) - { - bytes.Add((byte)'\t'); - } - else if (i < data.Count - 1) - { - bytes.Add((byte)'\n'); - } - } - catch (InvalidCastException e) - { - throw new Exception($"Record {i}, field {j} expected type compatible with {GetNameFromColumn(headerTypes[j])}", e); - } - } - } - - return bytes.ToArray(); - } - - protected static byte[] SerializeTsvParallel(IList data, FormatType tsvFormat) + protected static byte[] SerializeTsv(IList data, FormatType tsvFormat) { var bytes = new List(); @@ -2073,12 +1269,12 @@ public class SaneTsv } // Serialize data - SerializeTsvParallel(data, bytes, headerPropertyInfos.ToArray(), headerTypes.ToArray(), tsvFormat, 0, data.Count); + SerializeTsv(data, bytes, headerPropertyInfos.ToArray(), headerTypes.ToArray(), tsvFormat, 0, data.Count); return bytes.ToArray(); } - protected static void SerializeTsvParallel(IList data, List bytes, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, FormatType tsvFormat, int startIndex, int endIndex) + protected static void SerializeTsv(IList data, List bytes, PropertyInfo[] headerPropertyInfos, Type[] headerTypes, FormatType tsvFormat, int startIndex, int endIndex) { // Serialize data for (int i = 0; i < data.Count; i++) diff --git a/SaneTsvTest/Program.cs b/SaneTsvTest/Program.cs index 0c835f8..33aae0a 100644 --- a/SaneTsvTest/Program.cs +++ b/SaneTsvTest/Program.cs @@ -348,222 +348,6 @@ internal class Program : SaneTsv Console.WriteLine($"Unspecced parse time: {unspeccedParseTime}"); } - { - string testName = "Check parallel Simple TSV serialization"; - - int N = 100000; - var records = new StringTestRecord[N]; - var rand = new Random(1); - - for (int i = 0; i < N; i++) - { - records[i] = new StringTestRecord() - { - Column1 = rand.Next().ToString(), - column2 = rand.Next().ToString(), - Column3 = rand.Next().ToString(), - }; - } - - string[][] recordStrings = records.Select(record => new string[] { record.Column1, record.column2, record.Column3 }).ToArray(); - - DateTime lastTime = DateTime.Now; - byte[] serialized1 = SaneTsv.SerializeSimpleTsv(new string[] { "column1", "column2", "columnthree\nyep" }, recordStrings); - TimeSpan unparallelTime = DateTime.Now - lastTime; - lastTime = DateTime.Now; - byte[] serialized2 = SaneTsv.SerializeSimpleTsvParallel(new string[] { "column1", "column2", "columnthree\nyep" }, recordStrings); - TimeSpan parallelTime = DateTime.Now - lastTime; - - Console.WriteLine($"Unparallel serialization time: {unparallelTime}"); - Console.WriteLine($"Parallel serialization time: {parallelTime}"); - - bool matching = true; - for (int i = 0; i < Math.Min(serialized1.Length, serialized2.Length); i++) - { - if (serialized1[i] != serialized2[i]) - { - matching = false; - break; - } - } - - if (matching) - { - Console.WriteLine($"Passed {testName}"); - } - else - { - Console.WriteLine($"Failed {testName}"); - } - } - - { - string testName = "Check Simple TSV parallel parsing"; - - int N = 100000; - var records = new StringTestRecord[N]; - var rand = new Random(1); - - for (int i = 0; i < N; i++) - { - records[i] = new StringTestRecord() - { - Column1 = rand.Next().ToString(), - column2 = rand.Next().ToString(), - Column3 = rand.Next().ToString(), - }; - } - - byte[] serialized = SaneTsv.SerializeSimpleTsv(records); - - DateTime lastTime = DateTime.Now; - (string[] headers2, string[][] data2) = SaneTsv.ParseSimpleTsv(serialized); - TimeSpan unparallelTime = DateTime.Now - lastTime; - lastTime = DateTime.Now; - (string[] headers, string[][] data) = SaneTsv.ParseSimpleTsvParallel(serialized); - TimeSpan parallelTime = DateTime.Now - lastTime; - - Console.WriteLine($"Unparallel parse time: {unparallelTime}"); - Console.WriteLine($"Parallel parse time: {parallelTime}"); - - bool matching = true; - for (int j = 0; j < Math.Min(headers2.Length, headers.Length); j++) - { - if (headers[j] != headers2[j]) - { - matching = false; - break; - } - } - - for (int i = 0; i < Math.Min(data.Length, data2.Length) && matching; i++) - { - for (int j = 0; j < data[0].Length; j++) - { - if (data[i][j] != data2[i][j]) - { - matching = false; - break; - } - } - } - - if (matching) - { - Console.WriteLine($"Passed {testName}"); - } - else - { - Console.WriteLine($"Failed {testName}"); - } - } - - { - string testName = "Check parallel serialization"; - - int N = 1000; - var records = new BoolTestRecord[N]; - var rand = new Random(1); - - for (int i = 0; i < N; i++) - { - byte[] bytes = new byte[rand.Next(50)]; - rand.NextBytes(bytes); - records[i] = new BoolTestRecord() - { - Column1 = rand.NextDouble() > 0.5, - column2 = bytes, - Column3 = rand.Next().ToString(), - }; - } - - DateTime lastTime = DateTime.Now; - byte[] serialized1 = SaneTsv.SerializeTsv(records, FormatType.COMMENTED_TSV); - TimeSpan unparallelTime = DateTime.Now - lastTime; - lastTime = DateTime.Now; - byte[] serialized2 = SaneTsv.SerializeTsvParallel(records, FormatType.COMMENTED_TSV); - TimeSpan parallelTime = DateTime.Now - lastTime; - - Console.WriteLine($"Unparallel serialization time: {unparallelTime}"); - Console.WriteLine($"Parallel serialization time: {parallelTime}"); - - bool matching = true; - for (int i = 0; i < Math.Min(serialized1.Length, serialized2.Length); i++) - { - if (serialized1[i] != serialized2[i]) - { - matching = false; - break; - } - } - - if (matching) - { - Console.WriteLine($"Passed {testName}"); - } - else - { - Console.WriteLine($"Failed {testName}"); - } - } - - { - string testName = "Check parallel parsing"; - - int N = 1000000; - var records = new BoolTestRecord[N]; - var rand = new Random(1); - - for (int i = 0; i < N; i++) - { - byte[] bytes = new byte[rand.Next(50)]; - rand.NextBytes(bytes); - records[i] = new BoolTestRecord() - { - Column1 = rand.NextDouble() > 0.5, - column2 = bytes, - Column3 = rand.Next().ToString(), - }; - } - - byte[] serialized2 = SaneTsv.SerializeTsvParallel(records, FormatType.COMMENTED_TSV); - - DateTime lastTime = DateTime.Now; - CommentedTsv parsed = (CommentedTsv)SaneTsv.Parse(serialized2, FormatType.COMMENTED_TSV); - TimeSpan unparallelTime = DateTime.Now - lastTime; - lastTime = DateTime.Now; - CommentedTsv parsed2 = (CommentedTsv)SaneTsv.ParseParallel(serialized2, FormatType.COMMENTED_TSV); - TimeSpan parallelTime = DateTime.Now - lastTime; - - Console.WriteLine($"Unparallel parsing time: {unparallelTime}"); - Console.WriteLine($"Parallel parsing time: {parallelTime}"); - - bool matching = parsed.FileComment == parsed2.FileComment; - - matching &= parsed.Records.Count == parsed2.Records.Count; - - for (int i = 0; matching && i < parsed.Records.Count; i++) - { - matching &= parsed.Records[i].Comment == parsed2.Records[i].Comment; - matching &= parsed.Records[i].Column1 == parsed2.Records[i].Column1; - matching &= parsed.Records[i].column2.Length == parsed2.Records[i].column2.Length; - for (int j = 0; matching && j < parsed.Records[i].column2.Length; j++) - { - matching &= parsed.Records[i].column2[j] == parsed2.Records[i].column2[j]; - } - } - - if (matching) - { - Console.WriteLine($"Passed {testName}"); - } - else - { - Console.WriteLine($"Failed {testName}"); - } - } - - Console.WriteLine("Done with tests"); } }