Add type parsing
This commit is contained in:
		
							
								
								
									
										267
									
								
								SaneTsv.cs
									
									
									
									
									
								
							
							
						
						
									
										267
									
								
								SaneTsv.cs
									
									
									
									
									
								
							@@ -7,20 +7,37 @@ namespace NathanMcRae;
 | 
			
		||||
/// </summary>
 | 
			
		||||
public class SaneTsv
 | 
			
		||||
{
 | 
			
		||||
  /// <summary>
  /// Data type of a column, taken from the text after the final ':' in its header
  /// ("string", "boolean", "float32", "float64", "uint32", "uint64", "int32",
  /// "int64" or "binary"). A header without a type is treated as <see cref="STRING"/>.
  /// </summary>
  public enum ColumnType
  {
    /// <summary>UTF-8 text, stored as <see cref="string"/>.</summary>
    STRING = 0,

    /// <summary>Exactly "TRUE" or "FALSE", stored as <see cref="bool"/>.</summary>
    BOOLEAN = 1,

    /// <summary>Single-precision float, stored as <see cref="float"/>.</summary>
    FLOAT32 = 2,

    /// <summary>Double-precision float, stored as <see cref="double"/>.</summary>
    FLOAT64 = 3,

    /// <summary>Unsigned 32-bit integer, stored as <see cref="uint"/>.</summary>
    UINT32 = 4,

    /// <summary>Unsigned 64-bit integer, stored as <see cref="ulong"/>.</summary>
    UINT64 = 5,

    /// <summary>Signed 32-bit integer, stored as <see cref="int"/>.</summary>
    INT32 = 6,

    /// <summary>Signed 64-bit integer, stored as <see cref="long"/>.</summary>
    INT64 = 7,

    /// <summary>Raw field bytes kept as <c>byte[]</c>; never decoded as UTF-8.</summary>
    BINARY = 8,
  }
 | 
			
		||||
 | 
			
		||||
  // TODO: We need to be able to update all these in tandem somehow

  /// <summary>
  /// Column names in header order. When a header carries a type, the name is the
  /// text before its final ':'.
  /// </summary>
  public string[] ColumnNames { get; protected set; }

  /// <summary>Column types, index-aligned with <see cref="ColumnNames"/>.</summary>
  public ColumnType[] ColumnTypes { get; protected set; }

  /// <summary>
  /// Parsed values grouped by column name, one entry per record. Values are boxed
  /// according to the column's <see cref="ColumnType"/> (e.g. bool, float, byte[]).
  /// </summary>
  public Dictionary<string, List<object>> Columns { get; protected set; }

  /// <summary>Parsed records in the order they were added.</summary>
  public List<SaneTsvRecord> Records { get; protected set; }
 | 
			
		||||
 | 
			
		||||
  // TODO: Parse with specified columns / types
 | 
			
		||||
 | 
			
		||||
  public static SaneTsv Parse(byte[] inputBuffer)
 | 
			
		||||
  {
 | 
			
		||||
    var parsed = new SaneTsv();
 | 
			
		||||
    parsed.Columns = new Dictionary<string, List<string>>();
 | 
			
		||||
    parsed.Columns = new Dictionary<string, List<object>>();
 | 
			
		||||
    parsed.ColumnNames = new string[] { };
 | 
			
		||||
    parsed.ColumnTypes = new ColumnType[] { };
 | 
			
		||||
    parsed.Records = new List<SaneTsvRecord>();
 | 
			
		||||
 | 
			
		||||
    var fieldBytes = new List<byte>();
 | 
			
		||||
    var fields = new List<string>();
 | 
			
		||||
    var fields = new List<byte[]>();
 | 
			
		||||
    int numFields = -1;
 | 
			
		||||
    for (int i = 0; i < inputBuffer.Count(); i++)
 | 
			
		||||
    {
 | 
			
		||||
@@ -53,26 +70,12 @@ public class SaneTsv
 | 
			
		||||
      else if (inputBuffer[i] == '\t')
 | 
			
		||||
      {
 | 
			
		||||
        // end of field
 | 
			
		||||
        try
 | 
			
		||||
        {
 | 
			
		||||
          fields.Add(Encoding.UTF8.GetString(fieldBytes.ToArray()));
 | 
			
		||||
        }
 | 
			
		||||
        catch (Exception e)
 | 
			
		||||
        {
 | 
			
		||||
          throw new Exception($"Field {fields.Count} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
 | 
			
		||||
        }
 | 
			
		||||
        fields.Add(fieldBytes.ToArray());
 | 
			
		||||
        fieldBytes.Clear();
 | 
			
		||||
      }
 | 
			
		||||
      else if (inputBuffer[i] == '\n')
 | 
			
		||||
      {
 | 
			
		||||
        try
 | 
			
		||||
        {
 | 
			
		||||
          fields.Add(Encoding.UTF8.GetString(fieldBytes.ToArray()));
 | 
			
		||||
        }
 | 
			
		||||
        catch (Exception e)
 | 
			
		||||
        {
 | 
			
		||||
          throw new Exception($"Field {fields.Count} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
 | 
			
		||||
        }
 | 
			
		||||
        fields.Add(fieldBytes.ToArray());
 | 
			
		||||
        fieldBytes.Clear();
 | 
			
		||||
 | 
			
		||||
        if (numFields < 0)
 | 
			
		||||
@@ -82,14 +85,76 @@ public class SaneTsv
 | 
			
		||||
          numFields = fields.Count;
 | 
			
		||||
 | 
			
		||||
          parsed.ColumnNames = new string[numFields];
 | 
			
		||||
          parsed.ColumnTypes = new ColumnType[numFields];
 | 
			
		||||
 | 
			
		||||
          int numTypesBlank = 0;
 | 
			
		||||
 | 
			
		||||
          for (int j = 0; j < fields.Count; j++)
 | 
			
		||||
          {
 | 
			
		||||
            string columnName = fields[j];
 | 
			
		||||
            string columnString;
 | 
			
		||||
            try
 | 
			
		||||
            {
 | 
			
		||||
              columnString = Encoding.UTF8.GetString(fields[j]);
 | 
			
		||||
            }
 | 
			
		||||
            catch (Exception e)
 | 
			
		||||
            {
 | 
			
		||||
              throw new Exception($"Header {fields.Count} is not valid UTF-8", e);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            string columnTypeString;
 | 
			
		||||
            string columnName;
 | 
			
		||||
            if (columnString.Contains(":")) {
 | 
			
		||||
              columnTypeString = columnString.Split(":").Last();
 | 
			
		||||
              columnName = columnString.Substring(0, columnString.Length - columnTypeString.Length - 1);
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
              columnTypeString = "";
 | 
			
		||||
              columnName = columnString;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            ColumnType type;
 | 
			
		||||
 | 
			
		||||
            switch (columnTypeString)
 | 
			
		||||
            {
 | 
			
		||||
              case "":
 | 
			
		||||
                numTypesBlank++;
 | 
			
		||||
                type = ColumnType.STRING;
 | 
			
		||||
                break;
 | 
			
		||||
              case "string":
 | 
			
		||||
                type = ColumnType.STRING;
 | 
			
		||||
                break;
 | 
			
		||||
              case "boolean":
 | 
			
		||||
                type = ColumnType.BOOLEAN;
 | 
			
		||||
                break;
 | 
			
		||||
              case "float32":
 | 
			
		||||
                type = ColumnType.FLOAT32;
 | 
			
		||||
                break;
 | 
			
		||||
              case "float64":
 | 
			
		||||
                type = ColumnType.FLOAT64;
 | 
			
		||||
                break;
 | 
			
		||||
              case "uint32":
 | 
			
		||||
                type = ColumnType.UINT32;
 | 
			
		||||
                break;
 | 
			
		||||
              case "uint64":
 | 
			
		||||
                type = ColumnType.UINT64;
 | 
			
		||||
                break;
 | 
			
		||||
              case "int32":
 | 
			
		||||
                type = ColumnType.INT32;
 | 
			
		||||
                break;
 | 
			
		||||
              case "int64":
 | 
			
		||||
                type = ColumnType.INT64;
 | 
			
		||||
                break;
 | 
			
		||||
              case "binary":
 | 
			
		||||
                type = ColumnType.BINARY;
 | 
			
		||||
                break;
 | 
			
		||||
              default:
 | 
			
		||||
                throw new Exception($"Invalid type '{columnTypeString}' for column {j}");
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            try
 | 
			
		||||
            {
 | 
			
		||||
              parsed.Columns.Add(columnName, new List<string>());
 | 
			
		||||
              parsed.Columns.Add(columnName, new List<object>());
 | 
			
		||||
            }
 | 
			
		||||
            catch (Exception e)
 | 
			
		||||
            {
 | 
			
		||||
@@ -97,6 +162,12 @@ public class SaneTsv
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            parsed.ColumnNames[j] = columnName;
 | 
			
		||||
            parsed.ColumnTypes[j] = type;
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if (numTypesBlank != 0 && numTypesBlank != fields.Count)
 | 
			
		||||
          {
 | 
			
		||||
            throw new Exception("Types must be provided for all columns or none. Use 'string' for columns missing types.");
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          fields.Clear();
 | 
			
		||||
@@ -107,13 +178,7 @@ public class SaneTsv
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          for (int j = 0; j < fields.Count; j++)
 | 
			
		||||
          {
 | 
			
		||||
              parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          parsed.Records.Add(new SaneTsvRecord(parsed, fields.ToArray()));
 | 
			
		||||
          fields.Clear();
 | 
			
		||||
          AddRecord(parsed, fields);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      else
 | 
			
		||||
@@ -122,50 +187,146 @@ public class SaneTsv
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    try
 | 
			
		||||
    {
 | 
			
		||||
      fields.Add(Encoding.UTF8.GetString(fieldBytes.ToArray()));
 | 
			
		||||
    }
 | 
			
		||||
    catch (Exception e)
 | 
			
		||||
    {
 | 
			
		||||
      throw new Exception($"Field {fields.Count} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
 | 
			
		||||
    }
 | 
			
		||||
    fields.Add(fieldBytes.ToArray());
 | 
			
		||||
 | 
			
		||||
    if (numFields == 0)
 | 
			
		||||
    {
 | 
			
		||||
      throw new Exception("Found 0 fields on last line. Possibly because of extra \\n after last record");
 | 
			
		||||
    }
 | 
			
		||||
    if (numFields != fields.Count)
 | 
			
		||||
    {
 | 
			
		||||
      throw new Exception($"Expected {numFields} fields on line {parsed.Records.Count + 2}, but found {fields.Count}");
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
      for (int j = 0; j < fields.Count; j++)
 | 
			
		||||
      {
 | 
			
		||||
        try
 | 
			
		||||
        {
 | 
			
		||||
          parsed.Columns[parsed.ColumnNames[j]].Add(fields[j]);
 | 
			
		||||
        }
 | 
			
		||||
        catch (Exception e)
 | 
			
		||||
        {
 | 
			
		||||
          throw new Exception($"Field {j} on line {parsed.Records.Count + 1} is not valid UTF-8", e);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      parsed.Records.Add(new SaneTsvRecord(parsed, fields.ToArray()));
 | 
			
		||||
      fields.Clear();
 | 
			
		||||
      AddRecord(parsed, fields);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return parsed;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Strict decoder. Encoding.UTF8 replaces malformed bytes with U+FFFD instead of
  // throwing, so with it the "not valid UTF-8" error below could never be raised.
  private static readonly UTF8Encoding StrictUtf8 = new UTF8Encoding(false, true);

  /// <summary>
  /// Converts the raw bytes of one record into typed values according to
  /// <c>parsed.ColumnTypes</c>, then appends the values to <c>parsed.Columns</c> and a
  /// new <see cref="SaneTsvRecord"/> to <c>parsed.Records</c>.
  /// </summary>
  /// <param name="parsed">Table being built; its column names and types must already be set.</param>
  /// <param name="fields">Raw bytes of each field of the record. Cleared on success.</param>
  /// <exception cref="Exception">
  /// The field count does not match the column count, a non-binary field is not valid
  /// UTF-8, or a field cannot be parsed as its column's type.
  /// </exception>
  /// <remarks>
  /// All fields are converted before anything is stored, so a failing record leaves
  /// <paramref name="parsed"/> untouched. Numbers are parsed with the invariant culture
  /// so the same file yields the same values on every machine.
  /// </remarks>
  protected static void AddRecord(SaneTsv parsed, List<byte[]> fields)
  {
    // The header is line 1, so the record about to be added sits on line Records.Count + 2.
    int lineNumber = parsed.Records.Count + 2;

    if (fields.Count != parsed.ColumnTypes.Length)
    {
      throw new Exception($"Expected {parsed.ColumnTypes.Length} fields on line {lineNumber}, but found {fields.Count}");
    }

    // Same number styles as the parameterless TryParse overloads; only the culture is pinned.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var floatStyle = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    var integerStyle = System.Globalization.NumberStyles.Integer;

    var parsedFields = new object[fields.Count];
    for (int j = 0; j < fields.Count; j++)
    {
      ColumnType type = parsed.ColumnTypes[j];

      // All other types require the content to be UTF-8. Binary fields can ignore that.
      if (type == ColumnType.BINARY)
      {
        parsedFields[j] = fields[j];
        continue;
      }

      string fieldString;
      try
      {
        fieldString = StrictUtf8.GetString(fields[j]);
      }
      catch (DecoderFallbackException e)
      {
        throw new Exception($"Field {j} on line {lineNumber} is not valid UTF-8", e);
      }

      switch (type)
      {
        case ColumnType.STRING:
          parsedFields[j] = fieldString;
          break;

        case ColumnType.BOOLEAN:
          if (fieldString == "TRUE")
          {
            parsedFields[j] = true;
          }
          else if (fieldString == "FALSE")
          {
            parsedFields[j] = false;
          }
          else
          {
            throw new Exception($"Field {j} on line {lineNumber} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");
          }
          break;

        case ColumnType.FLOAT32:
          if (!float.TryParse(fieldString, floatStyle, invariant, out float parsedFloat))
          {
            throw new Exception($"Field {j} on line {lineNumber} is not valid single-precision float");
          }
          parsedFields[j] = parsedFloat;
          break;

        case ColumnType.FLOAT64:
          if (!double.TryParse(fieldString, floatStyle, invariant, out double parsedDouble))
          {
            throw new Exception($"Field {j} on line {lineNumber} is not valid double-precision float");
          }
          parsedFields[j] = parsedDouble;
          break;

        case ColumnType.UINT32:
          if (!UInt32.TryParse(fieldString, integerStyle, invariant, out UInt32 parsedUInt32))
          {
            throw new Exception($"Field {j} on line {lineNumber} is not valid UInt32");
          }
          parsedFields[j] = parsedUInt32;
          break;

        case ColumnType.UINT64:
          if (!UInt64.TryParse(fieldString, integerStyle, invariant, out UInt64 parsedUInt64))
          {
            throw new Exception($"Field {j} on line {lineNumber} is not valid UInt64");
          }
          parsedFields[j] = parsedUInt64;
          break;

        case ColumnType.INT32:
          if (!Int32.TryParse(fieldString, integerStyle, invariant, out Int32 parsedInt32))
          {
            throw new Exception($"Field {j} on line {lineNumber} is not valid Int32");
          }
          parsedFields[j] = parsedInt32;
          break;

        case ColumnType.INT64:
          if (!Int64.TryParse(fieldString, integerStyle, invariant, out Int64 parsedInt64))
          {
            throw new Exception($"Field {j} on line {lineNumber} is not valid Int64");
          }
          parsedFields[j] = parsedInt64;
          break;

        default:
          // BINARY was handled before decoding; anything landing here is an unknown enum value.
          throw new Exception($"Unexpected type {type}");
      }
    }

    // Commit only after every field converted, so columns and records stay the same length.
    for (int j = 0; j < parsedFields.Length; j++)
    {
      parsed.Columns[parsed.ColumnNames[j]].Add(parsedFields[j]);
    }

    parsed.Records.Add(new SaneTsvRecord(parsed, parsedFields));
    fields.Clear();
  }
 | 
			
		||||
 | 
			
		||||
  /// <summary>Gets the record at zero-based position <paramref name="i"/> in <see cref="Records"/>.</summary>
  public SaneTsvRecord this[int i]
  {
    get
    {
      return Records[i];
    }
  }
 | 
			
		||||
 | 
			
		||||
  public class SaneTsvRecord
 | 
			
		||||
  {
 | 
			
		||||
    public SaneTsv Parent { get; }
 | 
			
		||||
    public string[] Fields { get; }
 | 
			
		||||
    public object[] Fields { get; }
 | 
			
		||||
 | 
			
		||||
    public string this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
 | 
			
		||||
    public object this[string columnName] => Fields[Array.IndexOf(Parent.ColumnNames, columnName)];
 | 
			
		||||
 | 
			
		||||
    public SaneTsvRecord(SaneTsv parent, string[] fields)
 | 
			
		||||
    public SaneTsvRecord(SaneTsv parent, object[] fields)
 | 
			
		||||
    {
 | 
			
		||||
      Parent = parent;
 | 
			
		||||
      Fields = fields;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,38 @@
 | 
			
		||||
using NathanMcRae;
 | 
			
		||||
using System.Text;
 | 
			
		||||
 | 
			
		||||
// Each test lives in its own braces so names such as testString1 and parsed can be
// reused without clashing with another test's locals.
{
  // Headers without any ":type" suffix: the sample should still parse into two records.
  string testName = "Untyped test";
  string testString1 = "column1\tcolumn2\tcolumnthree\\nyep\nvalue1\tvalue\\\\twoo\tvaluetrhee\nthis\\nis\\na\\nvalue\tnother\tno\\ther";

  SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
  if (parsed.Records.Count == 2)
  {
    Console.WriteLine($"Passed {testName}");
  }
  else
  {
    Console.WriteLine($"Failed {testName}");
  }
}

{
  // Typed headers. The type is the text after the LAST ':', so the first column is
  // named "column1:type" and holds booleans.
  string testName = "Bool test";
  string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
    "\nTRUE\tvalue\\\\t\0woo\tvaluetrhee" +
    "\nFALSE\tnother\tno\\ther";

  SaneTsv parsed = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
  bool firstIsTrue = parsed.Records[0]["column1:type"] is bool first && first;
  bool secondIsFalse = parsed.Records[1]["column1:type"] is bool second && !second;
  if (firstIsTrue && secondIsFalse)
  {
    Console.WriteLine($"Passed {testName}");
  }
  else
  {
    Console.WriteLine($"Failed {testName}");
  }
}
 | 
			
		||||
 | 
			
		||||
{
  // "TUE" is neither "TRUE" nor "FALSE", so parsing must be rejected.
  string testName = "Bad bool test";
  try
  {
    string testString1 = "column1:type:boolean\tcolumn2:binary\tcolumnthree\\nyep:string" +
      "\nTUE\tvalue\\\\t\0woo\tvaluetrhee" +
      "\nFALSE\tnother\tno\\ther";

    _ = SaneTsv.Parse(Encoding.UTF8.GetBytes(testString1));
    Console.WriteLine($"Failed {testName}");
  }
  catch (Exception e)
  {
    // Only the boolean validation error counts as a pass; any other exception
    // (for example a header problem) means the parser failed for the wrong reason.
    if (e.Message.Contains("not valid boolean"))
    {
      Console.WriteLine($"Passed {testName}");
    }
    else
    {
      Console.WriteLine($"Failed {testName}");
    }
  }

  Console.WriteLine("Done with tests");
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user