Compare commits

...

2 Commits

Author SHA1 Message Date
Nathan McRae
78eaa5dbab Start parallel versions of general TSV serialization/parsing
They mostly work, but are not actually parallelized yet and likely have some edge cases.
Also, the soon-to-be parallel version of parsing is very slow compared to the original.
2024-02-25 22:35:56 -08:00
Nathan McRae
4ddb8dc44d Add parallel parsing/serialization for Simple TSV
Doesn't give as much of a performance bonus as hoped
2024-02-25 11:24:30 -08:00
2 changed files with 1328 additions and 11 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
using NathanMcRae;
using System.Text;
internal class Program
internal class Program : SaneTsv
{
public class TestRecord : SaneTsv.TsvRecord
{
@@ -101,6 +101,18 @@ internal class Program
public double BinFloat { get; set; }
}
// Test record made of three string columns. Main uses it to build the input for
// the Simple TSV timing test and the parallel-vs-sequential Simple TSV checks.
public class StringTestRecord : SaneTsv.TsvRecord
{
// Column name is given explicitly as "column1".
[SaneTsv.TypedTsvColumn("column1")]
public string Column1 { get; set; }
// No name is passed to the attribute here; presumably the column name falls back
// to the property name ("column2") -- TODO confirm against SaneTsv.
[SaneTsv.TypedTsvColumn]
public string column2 { get; set; }
// The column name contains an embedded newline; presumably this is to exercise
// escaping of header names -- TODO confirm against SaneTsv.
[SaneTsv.TypedTsvColumn("columnthree\nyep")]
public string Column3 { get; set; }
}
private static void Main(string[] args)
{
{
@@ -292,6 +304,266 @@ internal class Program
}
}
{
    // Timing comparison of simple parse methods and comparison of simple serialization methods.
    // Nothing is asserted here: the block only prints how long the record-based ("specced")
    // and string-array-based ("unspecced") Simple TSV calls take on the same data.
    int N = 1000000;
    var records = new StringTestRecord[N];
    var rand = new Random(1); // fixed seed so every run times the same data
    for (int i = 0; i < N; i++)
    {
        records[i] = new StringTestRecord()
        {
            Column1 = rand.Next().ToString(),
            column2 = rand.Next().ToString(),
            Column3 = rand.Next().ToString(),
        };
    }

    string[][] recordStrings = records.Select(record => new string[] { record.Column1, record.column2, record.Column3 }).ToArray();

    // Stopwatch is monotonic and high-resolution; DateTime.Now is neither, so it can
    // report skewed (even negative) durations when the system clock is adjusted.
    var timer = System.Diagnostics.Stopwatch.StartNew();
    byte[] serialized1 = SaneTsv.SerializeSimpleTsv<StringTestRecord>(records);
    TimeSpan speccedSerializationTime = timer.Elapsed;
    Console.WriteLine($"Specced serialization time: {speccedSerializationTime}");

    timer.Restart();
    byte[] serialized2 = SaneTsv.SerializeSimpleTsv(new string[] { "column1", "column2", "columnthree\nyep" }, recordStrings);
    TimeSpan unspeccedSerializationTime = timer.Elapsed;
    Console.WriteLine($"Unspecced serialization time: {unspeccedSerializationTime}");

    // The parse results are not inspected; only the elapsed time matters here.
    timer.Restart();
    _ = SaneTsv.ParseSimpleTsv<StringTestRecord>(serialized1);
    TimeSpan speccedParseTime = timer.Elapsed;
    Console.WriteLine($"Specced parse time: {speccedParseTime}");

    timer.Restart();
    _ = SaneTsv.ParseSimpleTsv(serialized2);
    TimeSpan unspeccedParseTime = timer.Elapsed;
    Console.WriteLine($"Unspecced parse time: {unspeccedParseTime}");
}
{
    // Checks that SerializeSimpleTsvParallel emits exactly the same bytes as
    // SerializeSimpleTsv for identical input, and prints how long each call took.
    string testName = "Check parallel Simple TSV serialization";

    int N = 100000;
    var records = new StringTestRecord[N];
    var rand = new Random(1); // fixed seed keeps the generated data reproducible
    for (int i = 0; i < N; i++)
    {
        records[i] = new StringTestRecord()
        {
            Column1 = rand.Next().ToString(),
            column2 = rand.Next().ToString(),
            Column3 = rand.Next().ToString(),
        };
    }

    string[][] recordStrings = records.Select(record => new string[] { record.Column1, record.column2, record.Column3 }).ToArray();

    // Stopwatch is monotonic and high-resolution, unlike DateTime.Now.
    var timer = System.Diagnostics.Stopwatch.StartNew();
    byte[] serialized1 = SaneTsv.SerializeSimpleTsv(new string[] { "column1", "column2", "columnthree\nyep" }, recordStrings);
    TimeSpan unparallelTime = timer.Elapsed;

    timer.Restart();
    byte[] serialized2 = SaneTsv.SerializeSimpleTsvParallel(new string[] { "column1", "column2", "columnthree\nyep" }, recordStrings);
    TimeSpan parallelTime = timer.Elapsed;

    Console.WriteLine($"Unparallel serialization time: {unparallelTime}");
    Console.WriteLine($"Parallel serialization time: {parallelTime}");

    // Compare the whole arrays, lengths included. Comparing only the common prefix
    // would let truncated or over-long parallel output pass.
    bool matching = serialized1.SequenceEqual(serialized2);

    if (matching)
    {
        Console.WriteLine($"Passed {testName}");
    }
    else
    {
        Console.WriteLine($"Failed {testName}");
    }
}
{
    // Checks that ParseSimpleTsvParallel returns the same headers and the same rows
    // as ParseSimpleTsv for identical input, and prints how long each call took.
    string testName = "Check Simple TSV parallel parsing";

    int N = 100000;
    var records = new StringTestRecord[N];
    var rand = new Random(1); // fixed seed keeps the generated data reproducible
    for (int i = 0; i < N; i++)
    {
        records[i] = new StringTestRecord()
        {
            Column1 = rand.Next().ToString(),
            column2 = rand.Next().ToString(),
            Column3 = rand.Next().ToString(),
        };
    }

    byte[] serialized = SaneTsv.SerializeSimpleTsv<StringTestRecord>(records);

    // Stopwatch is monotonic and high-resolution, unlike DateTime.Now.
    var timer = System.Diagnostics.Stopwatch.StartNew();
    (string[] headers2, string[][] data2) = SaneTsv.ParseSimpleTsv(serialized);
    TimeSpan unparallelTime = timer.Elapsed;

    timer.Restart();
    (string[] headers, string[][] data) = SaneTsv.ParseSimpleTsvParallel(serialized);
    TimeSpan parallelTime = timer.Elapsed;

    Console.WriteLine($"Unparallel parse time: {unparallelTime}");
    Console.WriteLine($"Parallel parse time: {parallelTime}");

    // Header count and row count must agree as well as the contents; checking only
    // the overlapping part would hide dropped or duplicated headers and rows.
    bool matching = headers.SequenceEqual(headers2) && data.Length == data2.Length;

    // Each row is compared in full (length and contents) rather than assuming every
    // row has the width of the first one.
    for (int i = 0; matching && i < data.Length; i++)
    {
        matching = data[i].SequenceEqual(data2[i]);
    }

    if (matching)
    {
        Console.WriteLine($"Passed {testName}");
    }
    else
    {
        Console.WriteLine($"Failed {testName}");
    }
}
{
    // Checks that SerializeTsvParallel emits exactly the same bytes as SerializeTsv
    // for identical COMMENTED_TSV input, and prints how long each call took.
    string testName = "Check parallel serialization";

    int N = 1000;
    var records = new BoolTestRecord[N];
    var rand = new Random(1); // fixed seed keeps the generated data reproducible
    for (int i = 0; i < N; i++)
    {
        // Random-length (0..49) binary payload for the byte-array column.
        byte[] bytes = new byte[rand.Next(50)];
        rand.NextBytes(bytes);
        records[i] = new BoolTestRecord()
        {
            Column1 = rand.NextDouble() > 0.5,
            column2 = bytes,
            Column3 = rand.Next().ToString(),
        };
    }

    // Stopwatch is monotonic and high-resolution, unlike DateTime.Now.
    var timer = System.Diagnostics.Stopwatch.StartNew();
    byte[] serialized1 = SaneTsv.SerializeTsv<BoolTestRecord>(records, FormatType.COMMENTED_TSV);
    TimeSpan unparallelTime = timer.Elapsed;

    timer.Restart();
    byte[] serialized2 = SaneTsv.SerializeTsvParallel<BoolTestRecord>(records, FormatType.COMMENTED_TSV);
    TimeSpan parallelTime = timer.Elapsed;

    Console.WriteLine($"Unparallel serialization time: {unparallelTime}");
    Console.WriteLine($"Parallel serialization time: {parallelTime}");

    // Compare the whole arrays, lengths included. Comparing only the common prefix
    // would let truncated or over-long parallel output pass.
    bool matching = serialized1.SequenceEqual(serialized2);

    if (matching)
    {
        Console.WriteLine($"Passed {testName}");
    }
    else
    {
        Console.WriteLine($"Failed {testName}");
    }
}
{
    // Checks that ParseParallel yields the same file comment and the same records
    // as Parse for identical COMMENTED_TSV input, and prints how long each call took.
    string testName = "Check parallel parsing";

    int N = 1000;
    var records = new BoolTestRecord[N];
    var rand = new Random(1); // fixed seed keeps the generated data reproducible
    for (int i = 0; i < N; i++)
    {
        // Random-length (0..49) binary payload for the byte-array column.
        byte[] bytes = new byte[rand.Next(50)];
        rand.NextBytes(bytes);
        records[i] = new BoolTestRecord()
        {
            Column1 = rand.NextDouble() > 0.5,
            column2 = bytes,
            Column3 = rand.Next().ToString(),
        };
    }

    byte[] serialized2 = SaneTsv.SerializeTsvParallel<BoolTestRecord>(records, FormatType.COMMENTED_TSV);

    // Stopwatch is monotonic and high-resolution, unlike DateTime.Now.
    var timer = System.Diagnostics.Stopwatch.StartNew();
    CommentedTsv<BoolTestRecord> parsed = (CommentedTsv<BoolTestRecord>)SaneTsv.Parse<BoolTestRecord>(serialized2, FormatType.COMMENTED_TSV);
    TimeSpan unparallelTime = timer.Elapsed;

    timer.Restart();
    CommentedTsv<BoolTestRecord> parsed2 = (CommentedTsv<BoolTestRecord>)SaneTsv.ParseParallel<BoolTestRecord>(serialized2, FormatType.COMMENTED_TSV);
    TimeSpan parallelTime = timer.Elapsed;

    Console.WriteLine($"Unparallel parsing time: {unparallelTime}");
    Console.WriteLine($"Parallel parsing time: {parallelTime}");

    bool matching = parsed.FileComment == parsed2.FileComment;
    matching &= parsed.Records.Count == parsed2.Records.Count;

    for (int i = 0; matching && i < parsed.Records.Count; i++)
    {
        var expected = parsed.Records[i];
        var actual = parsed2.Records[i];

        matching &= expected.Comment == actual.Comment;
        matching &= expected.Column1 == actual.Column1;
        // Byte payloads are compared by length and contents.
        matching &= expected.column2.SequenceEqual(actual.column2);
        // Column3 is populated above, so it has to be compared too; otherwise a
        // mismatch in that column would go unnoticed.
        matching &= expected.Column3 == actual.Column3;
    }

    if (matching)
    {
        Console.WriteLine($"Passed {testName}");
    }
    else
    {
        Console.WriteLine($"Failed {testName}");
    }
}
Console.WriteLine("Done with tests");
}
}