2024-03-11 05:43:11 +00:00
using System.Globalization ;
using System.Reflection ;
2024-02-20 22:30:01 +00:00
using System.Text ;
2024-03-11 05:43:11 +00:00
using System.Text.RegularExpressions ;
2024-03-16 01:58:07 +00:00
using UnitsNet ;
using UnitsNet.Units ;
2024-02-14 03:15:07 +00:00
namespace NathanMcRae ;
2024-02-23 06:09:13 +00:00
/// <summary>
/// Container for the records read from a TSV file.
/// </summary>
/// <typeparam name="T">Record type; must derive from <see cref="SaneTsv.TsvRecord"/>.</typeparam>
public class Tsv<T>
    where T : SaneTsv.TsvRecord
{
    /// <summary>The parsed records.</summary>
    public virtual List<T> Records { get; set; }
}
2024-02-23 07:16:35 +00:00
/// <summary>
/// A <see cref="Tsv{T}"/> that can additionally carry a file-level comment.
/// </summary>
/// <typeparam name="T">Record type; must derive from <see cref="SaneTsv.TsvRecord"/>.</typeparam>
public class CommentedTsv<T> : Tsv<T>
    where T : SaneTsv.TsvRecord
{
    /// <summary>The parsed records.</summary>
    public override List<T> Records { get; set; }

    /// <summary>Comment that precedes the header line, or <c>null</c> when there is none.</summary>
    public string FileComment { get; set; } = null;
}
2024-02-14 03:15:07 +00:00
/// <summary>
/// Sane Tab-Separated Values
/// </summary>
public class SaneTsv
{
2024-02-16 04:22:43 +00:00
// Column type markers. Like an enum, but more extensible: each kind of column is its own class.

/// <summary>Base class of all column type markers.</summary>
public class ColumnType { }

/// <summary>Column declared as <c>string</c> (also used for untyped columns).</summary>
public class StringType : ColumnType { }

/// <summary>Column declared as <c>boolean</c>.</summary>
public class BooleanType : ColumnType { }

/// <summary>Column declared as <c>float32</c> (text representation).</summary>
public class Float32Type : ColumnType { }

/// <summary>Column declared as <c>float32-le</c> (raw little-endian bytes).</summary>
public class Float32LEType : ColumnType { }

/// <summary>Column declared as <c>float64</c> (text representation).</summary>
public class Float64Type : ColumnType { }

/// <summary>Column declared as <c>float64-le</c> (raw little-endian bytes).</summary>
public class Float64LEType : ColumnType { }

/// <summary>Column declared as <c>uint32</c>.</summary>
public class UInt32Type : ColumnType { }

/// <summary>Column declared as <c>uint64</c>.</summary>
public class UInt64Type : ColumnType { }

/// <summary>Column declared as <c>int32</c>.</summary>
public class Int32Type : ColumnType { }

/// <summary>Column declared as <c>int64</c>.</summary>
public class Int64Type : ColumnType { }

/// <summary>Column declared as <c>binary</c>; field bytes are stored without decoding.</summary>
public class BinaryType : ColumnType { }

/// <summary>Column declared as <c>iso8601:string</c>.</summary>
public class Iso8601Type : ColumnType { }
/// <summary>
/// Column type that pairs an underlying numeric column type with a physical unit.
/// </summary>
public class PhysicalUnitsType : ColumnType
{
    /// <summary>Unit that values in the column are expressed in.</summary>
    public UnitsNet.UnitInfo Units { get; }

    /// <summary>Column type describing how the numeric value itself is encoded.</summary>
    public ColumnType BaseType { get; internal set; }

    /// <summary>Creates a unit-carrying column type.</summary>
    /// <param name="units">Unit of the column's values.</param>
    /// <param name="baseType">Column type of the underlying numeric value.</param>
    public PhysicalUnitsType(UnitsNet.UnitInfo units, ColumnType baseType)
    {
        this.Units = units;
        this.BaseType = baseType;
    }
}
/// <summary>
/// List of unit symbols. The entries appear to be the SI base units followed by the
/// named SI derived units (NOTE(review): confirm intended use; nothing in the visible
/// part of this file reads this array).
/// </summary>
public static readonly string [ ] ValidUnits =
{
"m" ,
"s" ,
"A" ,
"K" ,
"cd" ,
"mol" ,
"kg" ,
"Hz" ,
"rad" ,
"sr" ,
"N" ,
"Pa" ,
"J" ,
"W" ,
"C" ,
"V" ,
"F" ,
"Ω" ,
"S" ,
"Wb" ,
"T" ,
"H" ,
"°C" ,
"lm" ,
"lx" ,
"Bq" ,
"Gy" ,
"Sv" ,
"kat"
} ;
// Version of the ExtraTSV format implemented by this parser (major.minor.patch).
// ParseExtraTsv rejects files whose major version differs from MajorVersion.
public static readonly int MajorVersion = 0 ;
public static readonly int MinorVersion = 0 ;
public static readonly int PatchVersion = 1 ;
// Matched against the file comment (the text after the leading '#'), which must start
// with " ExtraTSV Vx.y.z"; the three capture groups are major, minor and patch.
public static Regex VersionRegex = new Regex ( @"^ ExtraTSV V(\d+)\.(\d+)\.(\d+)" ) ;
2024-02-14 22:30:36 +00:00
2024-02-15 00:16:23 +00:00
/// <summary>
/// Format variants understood by the parser. The numeric order matters: the parser
/// compares values with <c>&gt;</c> / <c>&gt;=</c> to decide which features are allowed.
/// </summary>
protected enum FormatType
{
    /// <summary>Untyped columns; ':' is not allowed in header fields.</summary>
    SIMPLE_TSV = 0,

    /// <summary>Every header field must carry a type.</summary>
    TYPED_TSV = 1,

    /// <summary>Typed columns plus '#' comment lines.</summary>
    COMMENTED_TSV = 2,
}
2024-02-17 20:54:32 +00:00
// UTF-8 bytes of the boolean literals "TRUE" and "FALSE".
public static readonly byte [ ] TrueEncoded = Encoding . UTF8 . GetBytes ( "TRUE" ) ;
public static readonly byte [ ] FalseEncoded = Encoding . UTF8 . GetBytes ( "FALSE" ) ;
2024-02-17 05:26:35 +00:00
// Cached result of the byte-order probe; null until LittleEndian is first read.
protected static bool? _littleEndian = null;

/// <summary>
/// Whether doubles are laid out little-endian on this machine. Determined once by
/// checking that the last byte of negative infinity (0xFFF0...) is 0xFF, then cached.
/// </summary>
public static bool LittleEndian
{
    get
    {
        _littleEndian ??= BitConverter.GetBytes(double.NegativeInfinity)[7] == 255;
        return _littleEndian.Value;
    }
}
2024-02-14 03:15:07 +00:00
2024-02-23 06:09:13 +00:00
/// <summary>Parses <paramref name="inputBuffer"/> as Simple TSV (untyped header, no comments).</summary>
/// <typeparam name="T">Record type describing the expected columns.</typeparam>
/// <param name="inputBuffer">Raw file contents.</param>
/// <returns>The parsed records.</returns>
public static Tsv<T> ParseSimpleTsv<T>(byte[] inputBuffer) where T : TsvRecord, new()
    => Parse<T>(inputBuffer, FormatType.SIMPLE_TSV);
2024-02-23 06:09:13 +00:00
/// <summary>Parses <paramref name="inputBuffer"/> as Typed TSV (every header field carries a type).</summary>
/// <typeparam name="T">Record type describing the expected columns.</typeparam>
/// <param name="inputBuffer">Raw file contents.</param>
/// <returns>The parsed records.</returns>
public static Tsv<T> ParseTypedTsv<T>(byte[] inputBuffer) where T : TsvRecord, new()
    => Parse<T>(inputBuffer, FormatType.TYPED_TSV);
2024-02-23 06:09:13 +00:00
/// <summary>Parses <paramref name="inputBuffer"/> as Commented TSV (typed header plus '#' comment lines).</summary>
/// <typeparam name="T">Record type describing the expected columns.</typeparam>
/// <param name="inputBuffer">Raw file contents.</param>
/// <returns>The parsed records together with the file comment, if any.</returns>
public static CommentedTsv<T> ParseCommentedTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new()
    => (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);
2024-03-11 05:43:11 +00:00
/// <summary>
/// Parses <paramref name="inputBuffer"/> as ExtraTSV: Commented TSV whose file comment starts
/// with an " ExtraTSV Vx.y.z" version line.
/// </summary>
/// <typeparam name="T">Record type describing the expected columns.</typeparam>
/// <param name="inputBuffer">Raw file contents.</param>
/// <returns>The parsed records together with the file comment.</returns>
/// <exception cref="Exception">
/// The file comment is missing, does not match <see cref="VersionRegex"/>, or declares a major
/// version different from <see cref="MajorVersion"/>.
/// </exception>
public static CommentedTsv<T> ParseExtraTsv<T>(byte[] inputBuffer) where T : CommentedTsvRecord, new()
{
    CommentedTsv<T> parsed = (CommentedTsv<T>)Parse<T>(inputBuffer, FormatType.COMMENTED_TSV);

    string missingVersionMessage = $"ExtraTSV expects the file to start with '# ExtraTSV Vx.y.z' where x.y.z is a version compatible with {MajorVersion}.{MinorVersion}.{PatchVersion}";
    if (parsed.FileComment == null)
    {
        throw new Exception(missingVersionMessage);
    }

    Match match = VersionRegex.Match(parsed.FileComment);
    if (!match.Success)
    {
        throw new Exception(missingVersionMessage);
    }

    int fileMajorVersion = int.Parse(match.Groups[1].Value);
    if (fileMajorVersion != MajorVersion)
    {
        // Any difference in major version is rejected, so say which direction it differs in
        // rather than always claiming the file is newer.
        string relation = fileMajorVersion > MajorVersion ? "newer" : "older";
        throw new Exception($"File has major version ({fileMajorVersion}) which is {relation} than this parser's version {MajorVersion}");
    }

    return parsed;
}
2024-02-23 06:09:13 +00:00
/// <summary>
/// Parses a complete TSV buffer. The header line is read and validated against the columns
/// declared on <typeparamref name="T"/> (properties carrying a <c>TsvColumnAttribute</c>), then
/// the remaining bytes are handed to the record parser, split across parallel tasks when the
/// buffer is 10000 bytes or larger.
/// </summary>
/// <typeparam name="T">Record type describing the expected columns.</typeparam>
/// <param name="inputBuffer">Raw file contents.</param>
/// <param name="format">Format variant to enforce.</param>
/// <returns>
/// A <see cref="Tsv{T}"/>, or a <see cref="CommentedTsv{T}"/> when <paramref name="format"/> is
/// <c>COMMENTED_TSV</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="inputBuffer"/> is null.</exception>
/// <exception cref="Exception">
/// Invalid escape sequence, misplaced '#', or a header whose field count, names or types do not
/// match the columns declared on <typeparamref name="T"/>.
/// </exception>
protected static Tsv<T> Parse<T>(byte[] inputBuffer, FormatType format) where T : TsvRecord, new()
{
    if (inputBuffer == null)
    {
        throw new ArgumentNullException(nameof(inputBuffer));
    }

    Tsv<T> parsed;
    if (format == FormatType.COMMENTED_TSV)
    {
        parsed = new CommentedTsv<T>();
    }
    else
    {
        parsed = new Tsv<T>();
    }
    parsed.Records = new List<T>();

    // Collect the expected columns from the attributed properties of T.
    var columnTypes = new List<ColumnType>();
    var columnNames = new List<string>();
    var columnPropertyInfos = new List<PropertyInfo>();
    foreach (PropertyInfo property in typeof(T).GetProperties())
    {
        TsvColumnAttribute attribute = (TsvColumnAttribute)Attribute.GetCustomAttribute(property, typeof(TsvColumnAttribute));
        if (attribute == null)
        {
            continue;
        }

        columnNames.Add(attribute.ColumnName ?? property.Name);
        columnTypes.Add(attribute.ColumnType ?? GetColumnFromType(property.PropertyType));
        columnPropertyInfos.Add(property);
        // TODO: Check that the property type and given column type are compatible
    }

    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var currentComment = new StringBuilder();
    int line = 1;
    int currentLineStart = 0;

    for (int i = 0; i < inputBuffer.Length; i++)
    {
        if (inputBuffer[i] == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            if (inputBuffer[i + 1] == 'n')
            {
                fieldBytes.Add((byte)'\n');
                i++;
            }
            else if (inputBuffer[i + 1] == '\\')
            {
                fieldBytes.Add((byte)'\\');
                i++;
            }
            else if (inputBuffer[i + 1] == 't')
            {
                fieldBytes.Add((byte)'\t');
                i++;
            }
            else if (inputBuffer[i + 1] == '#')
            {
                fieldBytes.Add((byte)'#');
                i++;
            }
            else
            {
                throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
            }
        }
        else if (inputBuffer[i] == '\t')
        {
            // end of field
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (inputBuffer[i] == '\n')
        {
            // End of the header line: validate every header field against T's columns.
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();

            // Without this check extra header fields index past columnNames and missing
            // ones are silently accepted.
            if (fields.Count != columnNames.Count)
            {
                throw new Exception($"Expected {columnNames.Count} columns in header, but found {fields.Count}");
            }

            for (int j = 0; j < fields.Count; j++)
            {
                string columnString;
                try
                {
                    // NOTE(review): Encoding.UTF8 substitutes U+FFFD for invalid bytes instead of
                    // throwing, so this catch is unlikely to fire; confirm whether strict
                    // decoding is wanted here.
                    columnString = Encoding.UTF8.GetString(fields[j]);
                }
                catch (Exception e)
                {
                    throw new Exception($"Header field {j} is not valid UTF-8", e);
                }

                string[] columnTypeStrings;
                string columnName;
                if (columnString.Contains(':'))
                {
                    if (format == FormatType.SIMPLE_TSV)
                    {
                        throw new Exception($"Header field {j} contains ':', which is not allowed for column names");
                    }

                    // The text after the last ':' is the type; everything before it is the name
                    // (possibly shortened further below for compound types).
                    columnTypeStrings = columnString.Split(":");
                    columnName = string.Join(":", columnTypeStrings.Take(columnTypeStrings.Length - 1));
                }
                else
                {
                    if (format > FormatType.SIMPLE_TSV)
                    {
                        throw new Exception($"Header field {j} has no type");
                    }

                    columnTypeStrings = new string[] { "" };
                    columnName = columnString;
                }

                ColumnType type;
                switch (columnTypeStrings.Last())
                {
                    case "":
                        type = new StringType();
                        break;
                    case "string":
                        if (columnTypeStrings.Length > 2 && columnTypeStrings[columnTypeStrings.Length - 2] == "iso8601")
                        {
                            type = new Iso8601Type();
                            columnName = string.Join(":", columnTypeStrings.Take(columnTypeStrings.Length - 2));
                        }
                        // TODO: ISO8601 time spans
                        // TODO: ISO8601 time durations
                        else
                        {
                            type = new StringType();
                        }
                        break;
                    case "boolean":
                        type = new BooleanType();
                        break;
                    case "float32":
                        type = new Float32Type();
                        break;
                    case "float32-le":
                        type = new Float32LEType();
                        break;
                    case "float64":
                        type = ResolveFloat64Column(columnTypeStrings, new Float64Type(), ref columnName);
                        break;
                    case "float64-le":
                        type = ResolveFloat64Column(columnTypeStrings, new Float64LEType(), ref columnName);
                        break;
                    case "uint32":
                        type = new UInt32Type();
                        break;
                    case "uint64":
                        type = new UInt64Type();
                        break;
                    case "int32":
                        type = new Int32Type();
                        break;
                    case "int64":
                        type = new Int64Type();
                        break;
                    case "binary":
                        type = new BinaryType();
                        break;
                    default:
                        throw new Exception($"Invalid type '{columnTypeStrings.Last()}' for column {j}");
                }

                // TODO: Allow lax parsing (only worry about parsing columns that are given in the specifying type

                if (columnNames[j] != columnName)
                {
                    throw new Exception($"Column {j} has name {columnName}, but expected {columnNames[j]}");
                }

                if (columnTypes[j].GetType() != type.GetType())
                {
                    throw new Exception($"Column {j} has type {type}, but expected {columnTypes[j]}");
                }
            }

            // Comment lines seen before the header form the file comment.
            if (currentComment.Length > 0)
            {
                if (parsed is CommentedTsv<T> commentedParsed)
                {
                    commentedParsed.FileComment = currentComment.ToString();
                    currentComment.Clear();
                }
                else
                {
                    throw new Exception("Found a file comment, but parser wasn't expecting a comment");
                }
            }

            fields.Clear();

            line++;
            currentLineStart = i + 1;
            // Done parsing header
            break;
        }
        else if (inputBuffer[i] == '#')
        {
            if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
            {
                // Comment line: consume up to (not including) the terminating newline.
                int j = i;
                for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
                if (j < inputBuffer.Length)
                {
                    var commentBytes = new byte[j - i - 1];
                    Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
                    if (currentComment.Length > 0)
                    {
                        currentComment.Append('\n');
                    }
                    currentComment.Append(Encoding.UTF8.GetString(commentBytes));
                    i = j;
                    currentLineStart = i + 1;
                    line++;
                }
                else
                {
                    throw new Exception("Comments at end of file are not allowed");
                }
            }
            else
            {
                throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
            }
        }
        else
        {
            fieldBytes.Add(inputBuffer[i]);
        }
    }

    PropertyInfo[] propertyArray = columnPropertyInfos.ToArray();
    ColumnType[] typeArray = columnTypes.ToArray();

    // TODO: need to figure out where the crossover is
    // Complication: it probably depends on processor count
    if (inputBuffer.Length < 10000)
    {
        // Start on the header's newline: the record parser begins after the first '\n' it finds.
        parsed.Records.AddRange(Parse<T>(inputBuffer, format, propertyArray, typeArray, currentLineStart - 1, inputBuffer.Length));
        return parsed;
    }

    int parseStart = currentLineStart;
    // At least one task, otherwise a single-core host divides by zero below.
    int tasks = Math.Max(1, Environment.ProcessorCount - 1);
    int splitCount = (inputBuffer.Length - parseStart) / tasks;
    T[][] parsedValues = new T[tasks][];
    Parallel.For(0, tasks, taskIndex =>
    {
        int startIndex = taskIndex * splitCount + parseStart - 1;
        int endIndex = taskIndex == tasks - 1
            ? inputBuffer.Length
            : (taskIndex + 1) * splitCount + parseStart;
        parsedValues[taskIndex] = Parse<T>(inputBuffer, format, propertyArray, typeArray, startIndex, endIndex);
    });

    // TODO: Handle relative line numbers
    for (int taskIndex = 0; taskIndex < tasks; taskIndex++)
    {
        parsed.Records.AddRange(parsedValues[taskIndex]);
    }

    return parsed;

    // Resolves a float64-based column. A header of the form "name:unit:<UnitsTypeText>:float64[-le]"
    // yields a PhysicalUnitsType wrapping baseType and strips the unit parts from the name;
    // anything else yields baseType unchanged.
    ColumnType ResolveFloat64Column(string[] typeParts, ColumnType baseType, ref string name)
    {
        if (typeParts.Length > 3 && typeParts[typeParts.Length - 2] == UnitsTypeText)
        {
            string unitName = typeParts[typeParts.Length - 3];
            if (!UnitsNet.Quantity.TryFromUnitAbbreviation(1, unitName, out UnitsNet.IQuantity quantity))
            {
                throw new Exception($"Invalid units: {unitName}");
            }

            ColumnType withUnits = new PhysicalUnitsType(UnitsNet.Quantity.GetUnitInfo(quantity.Unit), baseType);
            name = string.Join(":", typeParts.Take(typeParts.Length - 3));
            return withUnits;
        }

        return baseType;
    }
}
2024-02-26 06:35:56 +00:00
// This approach is slightly different than others. We skip the record that startIndex is in and
// include the record that endIndex is in. We do this because in order to include the record
// startIndex is in we'd have to go back to the start of the record's comment, and to know
// exactly where that comment started we'd have to go back to the start of the record before that
// (not including that other record's comment).
2024-03-11 05:35:30 +00:00
/// <summary>
/// Parses the records of one slice of <paramref name="inputBuffer"/>. Parsing starts after the
/// first line break found at or after <paramref name="startIndex"/> (so the record containing
/// <paramref name="startIndex"/> is skipped) and continues through the record that
/// <paramref name="endIndex"/> falls in.
/// </summary>
/// <typeparam name="T">Record type to create.</typeparam>
/// <param name="inputBuffer">Raw file contents.</param>
/// <param name="format">Format variant; comment lines are only accepted for <c>COMMENTED_TSV</c> and above.</param>
/// <param name="columnPropertyInfos">Property to set for each column, in column order.</param>
/// <param name="columnTypes">Column type of each column, in column order.</param>
/// <param name="startIndex">Index at which to begin looking for the first line break.</param>
/// <param name="endIndex">Exclusive end of the slice; equals the buffer length for the last slice.</param>
/// <returns>The records parsed from the slice (empty when no line break is found before the end of the buffer).</returns>
/// <exception cref="Exception">Invalid escape sequence, misplaced '#', or wrong number of fields on a line.</exception>
protected static T[] Parse<T>(byte[] inputBuffer, FormatType format, PropertyInfo[] columnPropertyInfos, ColumnType[] columnTypes, int startIndex, int endIndex) where T : TsvRecord, new()
{
    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var currentComment = new StringBuilder();
    List<T> parsed = new List<T>();
    bool parsingLastRecord = false;

    int relativeLine = 0;

    int i = startIndex;
    while (i < inputBuffer.Length - 1 && inputBuffer[i] != '\n' && inputBuffer[i + 1] != '#')
    {
        i++;
    }

    if (i >= inputBuffer.Length - 1)
    {
        return Array.Empty<T>();
    }

    // Start parsing after \n
    i++;
    int currentLineStart = i;

    for (; i < inputBuffer.Length && (i < endIndex || parsingLastRecord); i++)
    {
        if (inputBuffer[i] == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            if (inputBuffer[i + 1] == 'n')
            {
                fieldBytes.Add((byte)'\n');
                i++;
            }
            else if (inputBuffer[i + 1] == '\\')
            {
                fieldBytes.Add((byte)'\\');
                i++;
            }
            else if (inputBuffer[i + 1] == 't')
            {
                fieldBytes.Add((byte)'\t');
                i++;
            }
            else if (inputBuffer[i + 1] == '#')
            {
                fieldBytes.Add((byte)'#');
                i++;
            }
            else
            {
                throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {relativeLine} column {i - currentLineStart}");
            }
        }
        else if (inputBuffer[i] == '\t')
        {
            // end of field
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (inputBuffer[i] == '\n')
        {
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();

            CompleteRecord();

            parsingLastRecord = false;
            relativeLine++;
            currentLineStart = i + 1;
        }
        else if (inputBuffer[i] == '#')
        {
            if (i == currentLineStart && format >= FormatType.COMMENTED_TSV)
            {
                // Comment line: consume up to (not including) the terminating newline.
                int j = i;
                for (; j < inputBuffer.Length && inputBuffer[j] != '\n'; j++) { }
                if (j < inputBuffer.Length)
                {
                    var commentBytes = new byte[j - i - 1];
                    Array.Copy(inputBuffer, i + 1, commentBytes, 0, j - i - 1);
                    if (currentComment.Length > 0)
                    {
                        currentComment.Append('\n');
                    }
                    currentComment.Append(Encoding.UTF8.GetString(commentBytes));
                    i = j;
                    currentLineStart = i + 1;
                    relativeLine++;
                }
                else
                {
                    throw new Exception("Comments at end of file are not allowed");
                }
            }
            else
            {
                throw new Exception($"Found unescaped '#' at line {relativeLine}, column {i - currentLineStart}");
            }
        }
        else
        {
            fieldBytes.Add(inputBuffer[i]);
        }

        // Once the slice end is reached, keep going until the current record's newline.
        // NOTE(review): an escape sequence or comment line that advances i past endIndex - 1
        // skips this check, and a newline exactly at endIndex - 1 sets the flag again after the
        // record was completed; confirm slice boundaries behave as intended in those cases.
        if (i == endIndex - 1)
        {
            parsingLastRecord = true;
        }
    }

    if (endIndex < inputBuffer.Length)
    {
        return parsed.ToArray();
    }

    // Last slice: whatever follows the final newline is the file's last record.
    fields.Add(fieldBytes.ToArray());
    CompleteRecord();

    return parsed.ToArray();

    // Checks the field count of the line just read, converts it into a record (attaching any
    // pending comment) and resets the field list.
    void CompleteRecord()
    {
        if (fields.Count != columnTypes.Length)
        {
            throw new Exception($"Expected {columnTypes.Length} fields on line {relativeLine}, but found {fields.Count}");
        }

        string comment = null;
        if (currentComment.Length > 0)
        {
            comment = currentComment.ToString();
            currentComment.Clear();
        }

        parsed.Add(ParseCurrentRecord<T>(columnTypes, columnPropertyInfos, fields, comment, relativeLine));
        fields.Clear();
    }
}
2024-03-11 05:35:30 +00:00
/// <summary>
/// Variant of <c>ParseCurrentRecord</c> constrained to record types that can carry a comment.
/// All arguments are forwarded unchanged.
/// </summary>
protected static T ParseCurrentCommentedRecord<T>(ColumnType[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : CommentedTsvRecord, new()
    => (T)ParseCurrentRecord<T>(columnTypes, properties, fields, comment, line);
2024-02-25 19:24:30 +00:00
2024-03-11 05:35:30 +00:00
/// <summary>
/// Builds one record of type <typeparamref name="T"/> from the unescaped field bytes of a line,
/// converting each field according to its column type and assigning it to the matching property.
/// </summary>
/// <typeparam name="T">Record type to create.</typeparam>
/// <param name="columnTypes">Column type of each field, in column order.</param>
/// <param name="properties">Property to set for each field, in column order.</param>
/// <param name="fields">Unescaped bytes of each field on the line.</param>
/// <param name="comment">Comment preceding the record, or <c>null</c>.</param>
/// <param name="line">Line number stored on the record and used in error messages.</param>
/// <returns>The populated record.</returns>
/// <exception cref="Exception">
/// A comment was supplied for a record type that cannot hold one, a field cannot be converted
/// to its column type, or the column type is not supported.
/// </exception>
protected static T ParseCurrentRecord<T>(ColumnType[] columnTypes, PropertyInfo[] properties, List<byte[]> fields, string comment, int line) where T : TsvRecord, new()
{
    // Numbers are machine-readable data: always parse them with the invariant culture so the
    // result does not depend on the host's regional settings. The styles are the defaults the
    // culture-sensitive overloads use.
    const NumberStyles floatStyles = NumberStyles.Float | NumberStyles.AllowThousands;
    const NumberStyles integerStyles = NumberStyles.Integer;
    CultureInfo invariant = CultureInfo.InvariantCulture;

    T record = new T();
    if (record is CommentedTsvRecord commentedRecord)
    {
        commentedRecord.Comment = comment;
    }
    else if (comment != null)
    {
        throw new Exception($"Found comment for line {line}, but format does not support comments");
    }

    record.Line = line;

    for (int j = 0; j < fields.Count; j++)
    {
        ColumnType columnType = columnTypes[j];
        Type kind = columnType.GetType();
        PhysicalUnitsType physicalUnits = columnType as PhysicalUnitsType;

        // All other types require the content to be UTF-8. Binary fields can ignore that.
        if (kind == typeof(BinaryType))
        {
            // TODO: Use faster method for property setting
            // e.g. https://blog.marcgravell.com/2012/01/playing-with-your-member.html
            // or https://stackoverflow.com/questions/1027980/improving-performance-reflection-what-alternatives-should-i-consider
            // or https://stackoverflow.com/questions/12767091/why-are-propertyinfo-setvalue-and-getvalue-so-slow
            properties[j].SetValue(record, fields[j]);
            continue;
        }

        if (kind == typeof(Float32LEType))
        {
            byte[] floatBytes = ToNativeOrder(fields[j], sizeof(float), j);
            properties[j].SetValue(record, BitConverter.ToSingle(floatBytes, 0));
            continue;
        }

        if (kind == typeof(Float64LEType) || (physicalUnits != null && physicalUnits.BaseType is Float64LEType))
        {
            byte[] doubleBytes = ToNativeOrder(fields[j], sizeof(double), j);
            double value = BitConverter.ToDouble(doubleBytes, 0);
            if (physicalUnits != null)
            {
                properties[j].SetValue(record, UnitsNet.Quantity.From(value, physicalUnits.Units.Value));
            }
            else
            {
                properties[j].SetValue(record, value);
            }
            continue;
        }

        string fieldString;
        try
        {
            // NOTE(review): Encoding.UTF8 substitutes U+FFFD for invalid bytes instead of
            // throwing, so this catch is unlikely to fire; confirm whether strict decoding is
            // wanted here.
            fieldString = Encoding.UTF8.GetString(fields[j]);
        }
        catch (Exception e)
        {
            throw new Exception($"Field {j} on line {line} is not valid UTF-8", e);
        }

        // TODO: Add checking for numeric types format

        if (kind == typeof(StringType))
        {
            properties[j].SetValue(record, fieldString);
        }
        else if (kind == typeof(BooleanType))
        {
            bool parsedBool;
            if (fieldString == "TRUE")
            {
                parsedBool = true;
            }
            else if (fieldString == "FALSE")
            {
                parsedBool = false;
            }
            else
            {
                throw new Exception($"Field {j} on line {line} is not valid boolean. Must be 'TRUE' or 'FALSE' exactly");
            }

            properties[j].SetValue(record, parsedBool);
        }
        else if (kind == typeof(Float32Type))
        {
            if (!float.TryParse(fieldString, floatStyles, invariant, out float parsedFloat))
            {
                if (fieldString == "-inf")
                {
                    parsedFloat = float.NegativeInfinity;
                }
                else if (fieldString == "+inf")
                {
                    parsedFloat = float.PositiveInfinity;
                }
                else
                {
                    throw new Exception($"Field {j} on line {line} is not valid single-precision float");
                }
            }

            properties[j].SetValue(record, parsedFloat);
        }
        else if (kind == typeof(Float64Type) || (physicalUnits != null && physicalUnits.BaseType is Float64Type))
        {
            if (!double.TryParse(fieldString, floatStyles, invariant, out double parsedDouble))
            {
                if (fieldString == "-inf")
                {
                    parsedDouble = double.NegativeInfinity;
                }
                else if (fieldString == "+inf")
                {
                    parsedDouble = double.PositiveInfinity;
                }
                else
                {
                    throw new Exception($"Field {j} on line {line} is not valid double-precision float");
                }
            }

            if (physicalUnits != null)
            {
                properties[j].SetValue(record, UnitsNet.Quantity.From(parsedDouble, physicalUnits.Units.Value));
            }
            else
            {
                properties[j].SetValue(record, parsedDouble);
            }
        }
        else if (kind == typeof(UInt32Type))
        {
            if (!UInt32.TryParse(fieldString, integerStyles, invariant, out UInt32 parsedUInt32))
            {
                throw new Exception($"Field {j} on line {line} is not valid UInt32");
            }

            properties[j].SetValue(record, parsedUInt32);
        }
        else if (kind == typeof(UInt64Type))
        {
            if (!UInt64.TryParse(fieldString, integerStyles, invariant, out UInt64 parsedUInt64))
            {
                throw new Exception($"Field {j} on line {line} is not valid UInt64");
            }

            properties[j].SetValue(record, parsedUInt64);
        }
        else if (kind == typeof(Int32Type))
        {
            if (!Int32.TryParse(fieldString, integerStyles, invariant, out Int32 parsedInt32))
            {
                throw new Exception($"Field {j} on line {line} is not valid Int32");
            }

            properties[j].SetValue(record, parsedInt32);
        }
        else if (kind == typeof(Int64Type))
        {
            if (!Int64.TryParse(fieldString, integerStyles, invariant, out Int64 parsedInt64))
            {
                throw new Exception($"Field {j} on line {line} is not valid Int64");
            }

            properties[j].SetValue(record, parsedInt64);
        }
        else if (kind == typeof(Iso8601Type))
        {
            if (!DateTime.TryParseExact(fieldString, "yyyy-MM-ddTHH:mm:ss.ffff", invariant, DateTimeStyles.None, out DateTime parsed))
            {
                throw new Exception($"ISO 8601 timestamp format error on line {line}, field {j}");
            }

            properties[j].SetValue(record, parsed);
        }
        else
        {
            throw new Exception($"Unexpected type {columnTypes[j]}");
        }
    }

    return record;

    // Validates that a little-endian binary field has exactly the expected size and returns its
    // bytes in this machine's byte order (reversed on big-endian hosts).
    byte[] ToNativeOrder(byte[] littleEndianBytes, int size, int fieldIndex)
    {
        if (littleEndianBytes.Length != size)
        {
            throw new Exception($"Field {fieldIndex} on line {line} must be exactly {size} bytes, but found {littleEndianBytes.Length}");
        }

        if (LittleEndian)
        {
            return littleEndianBytes;
        }

        byte[] reversed = new byte[size];
        for (int k = 0; k < size; k++)
        {
            reversed[k] = littleEndianBytes[size - 1 - k];
        }
        return reversed;
    }
}
2024-02-26 06:35:56 +00:00
/// <summary>
/// Serializes a header row followed by string records as Simple TSV, encoded as UTF-8.
/// </summary>
/// <param name="header">Column names. Names must be unique and must not contain ':'.</param>
/// <param name="data">Records to write; each record is a list of field strings.</param>
/// <returns>The UTF-8 bytes of the escaped header line followed by the escaped records.</returns>
/// <exception cref="Exception">
/// A column name contains ':' or two column names are equal.
/// </exception>
public static byte[] SerializeSimpleTsv(IList<string> header, IList<IList<string>> data)
{
    var serialized = new List<byte>();
    var headerLine = new StringBuilder();

    // Serialize header
    for (int i = 0; i < header.Count; i++)
    {
        string columnName = header[i];

        if (columnName.Contains(':'))
        {
            throw new Exception($"Column {i} contains the character ':'");
        }

        for (int j = i + 1; j < header.Count; j++)
        {
            if (columnName == header[j])
            {
                throw new Exception("Column names in header must be unique");
            }
        }

        foreach (char c in columnName)
        {
            switch (c)
            {
                case '\n':
                    headerLine.Append("\\n");
                    break;
                case '\t':
                    headerLine.Append("\\t");
                    break;
                case '\\':
                    headerLine.Append("\\\\");
                    break;
                case '#':
                    headerLine.Append("\\#");
                    break;
                default:
                    headerLine.Append(c);
                    break;
            }
        }

        // Tab between columns, newline after the last one
        headerLine.Append(i == header.Count - 1 ? '\n' : '\t');
    }

    serialized.AddRange(Encoding.UTF8.GetBytes(headerLine.ToString()));

    // ProcessorCount - 1 is 0 on a single-core host; clamp so the chunk size
    // computation below can never divide by zero.
    int tasks = Math.Max(1, Environment.ProcessorCount - 1);

    // TODO: need to figure out where the crossover is
    // Complication: it probably depends on processor count
    if (data.Count < 100 || tasks == 1)
    {
        serialized.AddRange(Encoding.UTF8.GetBytes(SerializeSimpleTsv(data, 0, data.Count)));
    }
    else
    {
        int splitCount = data.Count / tasks;
        byte[][] chunks = new byte[tasks][];

        Parallel.For(0, tasks, chunk =>
        {
            // The last chunk also takes the remainder left over by the integer division
            int chunkEnd = chunk == tasks - 1 ? data.Count : (chunk + 1) * splitCount;
            string chunkText = SerializeSimpleTsv(data, chunk * splitCount, chunkEnd);
            chunks[chunk] = Encoding.UTF8.GetBytes(chunkText);
        });

        // Chunks are concatenated in index order so record order is preserved
        foreach (byte[] chunkBytes in chunks)
        {
            serialized.AddRange(chunkBytes);
        }
    }

    return serialized.ToArray();
}
/// <summary>
/// Escapes and joins the records <c>data[startIndex]</c> through <c>data[endIndex - 1]</c>.
/// </summary>
/// <remarks>
/// Fields are separated by tabs. A newline is written after every record except the
/// final element of <paramref name="data"/>, so a range that ends before the last
/// record still finishes with a newline.
/// </remarks>
/// <param name="data">All records; only the requested range is written.</param>
/// <param name="startIndex">Index of the first record to write.</param>
/// <param name="endIndex">Index one past the last record to write.</param>
/// <returns>The escaped text for the requested range.</returns>
public static string SerializeSimpleTsv(IList<IList<string>> data, int startIndex, int endIndex)
{
    var output = new StringBuilder();
    int lastRow = data.Count - 1;

    for (int row = startIndex; row < endIndex; row++)
    {
        IList<string> record = data[row];
        int lastField = record.Count - 1;

        for (int field = 0; field <= lastField; field++)
        {
            foreach (char c in record[field])
            {
                switch (c)
                {
                    case '\n':
                        output.Append("\\n");
                        break;
                    case '\t':
                        output.Append("\\t");
                        break;
                    case '\\':
                        output.Append("\\\\");
                        break;
                    case '#':
                        output.Append("\\#");
                        break;
                    default:
                        output.Append(c);
                        break;
                }
            }

            if (field < lastField)
            {
                output.Append('\t');
            }
            else if (row < lastRow)
            {
                output.Append('\n');
            }
        }
    }

    return output.ToString();
}
2024-03-08 20:31:40 +00:00
/// <summary>
/// Parses a Simple TSV document: an escaped, tab-separated header line followed by record lines.
/// </summary>
/// <param name="inputBuffer">The UTF-8 encoded document.</param>
/// <returns>The unescaped column names and one string array per record.</returns>
/// <exception cref="Exception">
/// The header has an invalid escape sequence, an unescaped '#', a name containing ':',
/// a name that is not valid UTF-8, or is not terminated by a newline; or the records
/// fail to parse.
/// </exception>
public static (string[] columns, string[][] data) ParseSimpleTsv(byte[] inputBuffer)
{
    // Encoding.UTF8 silently substitutes U+FFFD for malformed bytes, which would make the
    // "not valid UTF-8" error below unreachable. A throwing decoder makes the check real.
    var strictUtf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    var headerFields = new List<byte[]>();
    var fieldBytes = new List<byte>();
    int headerEnd = -1;

    for (int i = 0; i < inputBuffer.Length; i++)
    {
        byte current = inputBuffer[i];

        if (current == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            switch ((char)inputBuffer[i + 1])
            {
                case 'n':
                    fieldBytes.Add((byte)'\n');
                    break;
                case '\\':
                    fieldBytes.Add((byte)'\\');
                    break;
                case 't':
                    fieldBytes.Add((byte)'\t');
                    break;
                case '#':
                    fieldBytes.Add((byte)'#');
                    break;
                default:
                    throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {1} column {i}");
            }

            // Skip the escaped character
            i++;
        }
        else if (current == '\t')
        {
            // end of field
            headerFields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (current == '\n')
        {
            // This is the end of the header
            headerFields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
            headerEnd = i;
            break;
        }
        else if (current == '#')
        {
            throw new Exception($"Found unescaped '#' at line 1, column {i}");
        }
        else
        {
            fieldBytes.Add(current);
        }
    }

    if (headerEnd < 0)
    {
        // Previously this surfaced as a NullReferenceException on the column array
        throw new Exception("Header line is not terminated by '\\n'");
    }

    var columnNames = new string[headerFields.Count];
    for (int j = 0; j < headerFields.Count; j++)
    {
        string columnString;
        try
        {
            columnString = strictUtf8.GetString(headerFields[j]);
        }
        catch (DecoderFallbackException e)
        {
            throw new Exception($"Column {j} name is not valid UTF-8", e);
        }

        if (columnString.Contains(':'))
        {
            throw new Exception($"Header field {j} contains ':', which is not allowed for column names");
        }

        columnNames[j] = columnString;
    }

    // The record parser starts right after a '\n' found at its start index, so handing it
    // the header's own terminator makes it begin at the first data byte even when that
    // byte is itself a newline (an empty first record line) or lies past the end of the
    // buffer (no records at all).
    return (columnNames, ParseSimpleTsv(inputBuffer, columnNames.Length, headerEnd, inputBuffer.Length));
}
2024-03-08 20:31:40 +00:00
/// <summary>
/// Parses the record lines of a Simple TSV buffer between two byte offsets.
/// </summary>
/// <remarks>
/// Parsing begins at the start of the line containing <paramref name="startIndex"/>; when the
/// byte at <paramref name="startIndex"/> is itself a newline, parsing begins at the byte after it.
/// Line numbers in error messages are counted from 2 at the first parsed line.
/// The text after the last newline is always treated as a line, so an empty final line counts
/// as one empty field.
/// </remarks>
/// <param name="inputBuffer">The UTF-8 encoded document.</param>
/// <param name="numFields">Number of fields every record must have.</param>
/// <param name="startIndex">Offset inside the first line to parse.</param>
/// <param name="endIndex">Offset one past the last byte to parse.</param>
/// <returns>One string array per parsed record.</returns>
/// <exception cref="Exception">
/// A line has an invalid escape sequence, an unescaped '#', the wrong number of fields,
/// or a field that is not valid UTF-8.
/// </exception>
public static string[][] ParseSimpleTsv(byte[] inputBuffer, int numFields, int startIndex, int endIndex)
{
    // Encoding.UTF8 replaces malformed bytes instead of throwing, which would make the
    // "not valid UTF-8" errors unreachable. A throwing decoder makes the check real.
    var strictUtf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    var fieldBytes = new List<byte>();
    var fields = new List<byte[]>();
    var records = new List<string[]>();

    int line = 2;

    // Decodes the raw fields collected for the current line into a record
    string[] DecodeCurrentFields()
    {
        var fieldStrings = new string[fields.Count];
        for (int j = 0; j < fields.Count; j++)
        {
            try
            {
                fieldStrings[j] = strictUtf8.GetString(fields[j]);
            }
            catch (DecoderFallbackException e)
            {
                throw new Exception($"Line {line}, column {j} is not valid UTF-8", e);
            }
        }
        return fieldStrings;
    }

    // Go back to the start of the current line. The scan is bounded so that a start index
    // at the end of the buffer (no records) or on a first line with no preceding newline
    // does not index outside the array.
    int i = startIndex;
    if (i < inputBuffer.Length)
    {
        while (i >= 0 && inputBuffer[i] != '\n')
        {
            i--;
        }

        // We want to start at the first byte of the current line
        i++;
    }

    // Columns in error messages are relative to the start of the line being parsed
    int currentLineStart = i;

    for (; i < endIndex; i++)
    {
        byte current = inputBuffer[i];

        if (current == '\\')
        {
            if (i + 1 == inputBuffer.Length)
            {
                throw new Exception($"Found '\\' at end of input");
            }

            switch ((char)inputBuffer[i + 1])
            {
                case 'n':
                    fieldBytes.Add((byte)'\n');
                    break;
                case '\\':
                    fieldBytes.Add((byte)'\\');
                    break;
                case 't':
                    fieldBytes.Add((byte)'\t');
                    break;
                case '#':
                    fieldBytes.Add((byte)'#');
                    break;
                default:
                    throw new Exception($"Expected 'n', 't', '#', or '\\' after '\\' at line {line} column {i - currentLineStart}");
            }

            // Skip the escaped character
            i++;
        }
        else if (current == '\t')
        {
            // end of field
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();
        }
        else if (current == '\n')
        {
            // end of record
            fields.Add(fieldBytes.ToArray());
            fieldBytes.Clear();

            if (numFields != fields.Count)
            {
                throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
            }

            records.Add(DecodeCurrentFields());
            fields.Clear();

            line++;
            currentLineStart = i + 1;
        }
        else if (current == '#')
        {
            throw new Exception($"Found unescaped '#' at line {line}, column {i - currentLineStart}");
        }
        else
        {
            fieldBytes.Add(current);
        }
    }

    // Whatever follows the last newline is the final line; it always contributes at
    // least one (possibly empty) field.
    fields.Add(fieldBytes.ToArray());

    if (numFields != fields.Count)
    {
        if (endIndex == inputBuffer.Length)
        {
            throw new Exception($"Expected {numFields} fields on line {line}, but found {fields.Count}");
        }

        // The range stops inside a line; the incomplete line is not reported here
        return records.ToArray();
    }

    records.Add(DecodeCurrentFields());
    fields.Clear();

    return records.ToArray();
}
2024-03-16 01:58:07 +00:00
/// <summary>Type keyword that marks a column type as a physical-units type.</summary>
public static string UnitsTypeText = "ph-unit";

/// <summary>
/// Matches a complete physical-units type string of the form <c>unit:ph-unit:base</c>.
/// Group 1 is the unit text and group 2 is the numeric base type name.
/// </summary>
/// <remarks>
/// The pattern is anchored at both ends. Without the anchors the alternation stops at the
/// first alternative that fits, so "float32-le" was captured as "float32" (and "float64-le"
/// as "float64"), and trailing text after the base type was accepted.
/// </remarks>
public static Regex UnitsRegex = new Regex("^([^:]+):" + Regex.Escape(UnitsTypeText) + ":(float32|float32-le|float64|float64-le|uint32|uint64|int32|int64)$");
2024-03-11 05:35:30 +00:00
/// <summary>
/// Creates the <see cref="ColumnType"/> named by a header type string.
/// </summary>
/// <param name="type">
/// A type name such as "string", "float64-le" or "iso8601", or a physical-units type
/// matched by <see cref="UnitsRegex"/>. "iso8601:string", the spelling produced by
/// <see cref="GetNameFromColumn"/>, is accepted as well.
/// </param>
/// <returns>A new column type instance.</returns>
/// <exception cref="Exception">The text does not name a known column type.</exception>
public static ColumnType GetColumnFromString(string type)
{
    switch (type)
    {
        case "string":
            return new StringType();
        case "boolean":
            return new BooleanType();
        case "float32":
            return new Float32Type();
        case "float32-le":
            return new Float32LEType();
        case "float64":
            return new Float64Type();
        case "float64-le":
            return new Float64LEType();
        case "uint32":
            return new UInt32Type();
        case "uint64":
            return new UInt64Type();
        case "int32":
            return new Int32Type();
        case "int64":
            return new Int64Type();
        case "binary":
            return new BinaryType();
        case "iso8601":
        case "iso8601:string":
            return new Iso8601Type();
    }

    // Run the regex once and reuse the match for both the test and the groups
    Match match = UnitsRegex.Match(type);
    if (match.Success)
    {
        string unitName = match.Groups[1].Value;
        string baseType = match.Groups[2].Value;
        return new PhysicalUnitsType(ParseUnit(unitName), GetColumnFromString(baseType));
    }

    // Report the offending text itself; type.GetType() would always print System.String
    throw new Exception($"Invalid type: {type}");
}
/// <summary>
/// Chooses the default <see cref="ColumnType"/> for a .NET property type.
/// </summary>
/// <param name="type">The property type to map.</param>
/// <returns>A new column type instance for the given .NET type.</returns>
/// <exception cref="Exception">No column type is defined for the given type.</exception>
public static ColumnType GetColumnFromType(Type type)
{
    if (type == typeof(string))
    {
        return new StringType();
    }
    if (type == typeof(bool))
    {
        return new BooleanType();
    }
    if (type == typeof(float))
    {
        return new Float32Type();
    }
    if (type == typeof(double))
    {
        return new Float64Type();
    }
    if (type == typeof(UInt32))
    {
        return new UInt32Type();
    }
    if (type == typeof(UInt64))
    {
        return new UInt64Type();
    }
    if (type == typeof(Int32))
    {
        return new Int32Type();
    }
    if (type == typeof(Int64))
    {
        return new Int64Type();
    }
    if (type == typeof(byte[]))
    {
        return new BinaryType();
    }
    if (type == typeof(DateTime))
    {
        return new Iso8601Type();
    }
    if (type == typeof(UnitsNet.Mass))
    {
        // TODO: only Mass is mapped so far, and always to kilograms backed by float64
        var a = new UnitsNet.UnitInfo<UnitsNet.Units.MassUnit>(UnitsNet.Units.MassUnit.Kilogram, "kgs", new UnitsNet.BaseUnits(mass: UnitsNet.Units.MassUnit.Kilogram));
        return new PhysicalUnitsType(a, new Float64Type());
    }

    // Report the rejected type itself; type.GetType() would print the runtime Type class instead
    throw new Exception($"Invalid type: {type}");
}
2024-03-11 05:35:30 +00:00
/// <summary>
/// Returns the header type text for a column type.
/// </summary>
/// <remarks>
/// The built-in types are matched on their exact runtime type. A
/// <see cref="PhysicalUnitsType"/> is rendered as <c>unit:ph-unit:base</c>, using the
/// name of its unit and the text of its base type.
/// </remarks>
/// <param name="type">The column type to name.</param>
/// <returns>The type text written after the column name in a header.</returns>
/// <exception cref="Exception">The column type is not recognised.</exception>
public static string GetNameFromColumn(ColumnType type)
{
    Type exact = type.GetType();

    return exact switch
    {
        _ when exact == typeof(StringType) => "string",
        _ when exact == typeof(BooleanType) => "boolean",
        _ when exact == typeof(Float32Type) => "float32",
        _ when exact == typeof(Float32LEType) => "float32-le",
        _ when exact == typeof(Float64Type) => "float64",
        _ when exact == typeof(Float64LEType) => "float64-le",
        _ when exact == typeof(UInt32Type) => "uint32",
        _ when exact == typeof(UInt64Type) => "uint64",
        _ when exact == typeof(Int32Type) => "int32",
        _ when exact == typeof(Int64Type) => "int64",
        _ when exact == typeof(BinaryType) => "binary",
        _ when exact == typeof(Iso8601Type) => "iso8601:string",
        _ when type is PhysicalUnitsType unit => $"{unit.Units.Name}:{UnitsTypeText}:{GetNameFromColumn(unit.BaseType)}",
        _ => throw new Exception($"Invalid type: {type.GetType()}"),
    };
}
/// <summary>
/// Serializes records as Simple TSV by delegating to <c>SerializeTsv</c> with
/// <c>FormatType.SIMPLE_TSV</c>.
/// </summary>
/// <param name="data">The records to serialize.</param>
/// <returns>The serialized bytes.</returns>
public static byte[] SerializeSimpleTsv<T>(IList<T> data) where T : TsvRecord
    => SerializeTsv<T>(data, FormatType.SIMPLE_TSV);
/// <summary>
/// Serializes records as Typed TSV by delegating to <c>SerializeTsv</c> with
/// <c>FormatType.TYPED_TSV</c>.
/// </summary>
/// <param name="data">The records to serialize.</param>
/// <returns>The serialized bytes.</returns>
public static byte[] SerializeTypedTsv<T>(IList<T> data) where T : TsvRecord
    => SerializeTsv<T>(data, FormatType.TYPED_TSV);
/// <summary>
/// Serializes records as Commented TSV by delegating to <c>SerializeTsv</c> with
/// <c>FormatType.COMMENTED_TSV</c> and the given file comment.
/// </summary>
/// <param name="data">The records to serialize.</param>
/// <param name="fileComment">Comment text passed through as the file comment.</param>
/// <returns>The serialized bytes.</returns>
public static byte[] SerializeCommentedTsv<T>(IList<T> data, string fileComment) where T : CommentedTsvRecord
    => SerializeTsv<T>(data, FormatType.COMMENTED_TSV, fileComment);
2024-03-11 05:43:11 +00:00
/// <summary>
/// Serializes records as Commented TSV whose file comment is the ExtraTSV version
/// marker built from <c>MajorVersion</c>, <c>MinorVersion</c> and <c>PatchVersion</c>.
/// </summary>
/// <param name="data">The records to serialize.</param>
/// <returns>The serialized bytes.</returns>
public static byte[] SerializeExtraTsv<T>(IList<T> data) where T : TsvRecord
{
    string versionComment = $" ExtraTSV V{MajorVersion}.{MinorVersion}.{PatchVersion}";
    return SerializeTsv<T>(data, FormatType.COMMENTED_TSV, versionComment);
}
2024-03-11 05:35:30 +00:00
/// <summary>
/// Serializes records of type <typeparamref name="T"/>: an optional file comment, a header
/// built from the properties carrying a <see cref="TsvColumnAttribute"/>, then the records.
/// </summary>
/// <param name="data">The records to serialize.</param>
/// <param name="tsvFormat">
/// The format to write. Simple TSV requires every column to be a string column and writes
/// no type names in the header.
/// </param>
/// <param name="fileComment">
/// Optional comment written before the header with '#' at the start of each line;
/// only allowed for Commented TSV.
/// </param>
/// <returns>The serialized bytes.</returns>
/// <exception cref="Exception">
/// A file comment is given for a format other than Commented TSV, a non-string column is
/// used with Simple TSV, column names are not unique, or a column type has no name.
/// </exception>
protected static byte[] SerializeTsv<T>(IList<T> data, FormatType tsvFormat, string fileComment = null)
{
    var output = new List<byte>();

    if (fileComment != null)
    {
        if (tsvFormat != FormatType.COMMENTED_TSV)
        {
            throw new Exception($"File comments are not valid for {tsvFormat}");
        }

        string commentBlock = "#" + fileComment.Replace("\n", "\n#") + "\n";
        output.AddRange(Encoding.UTF8.GetBytes(commentBlock));
    }

    // Collect the columns declared on T, in the order reflection reports the properties
    var types = new List<ColumnType>();
    var names = new List<string>();
    var properties = new List<PropertyInfo>();

    foreach (PropertyInfo property in typeof(T).GetProperties())
    {
        var attribute = (TsvColumnAttribute)Attribute.GetCustomAttribute(property, typeof(TsvColumnAttribute));
        if (attribute == null)
        {
            continue;
        }

        string headerName = attribute.ColumnName ?? property.Name;
        names.Add(headerName);

        ColumnType headerType = attribute.ColumnType ?? GetColumnFromType(property.PropertyType);
        bool isStringColumn = headerType.GetType() == typeof(StringType);
        if (tsvFormat == FormatType.SIMPLE_TSV && !isStringColumn)
        {
            throw new Exception($"Serializing Simple TSV requires all columns be of type string, but column '{headerName}' has type '{headerType}'");
        }

        types.Add(headerType);
        properties.Add(property);
        // TODO: Check that the property type and given column type are compatible
    }

    // Serialize header
    int lastColumn = names.Count - 1;
    for (int column = 0; column <= lastColumn; column++)
    {
        for (int other = column + 1; other <= lastColumn; other++)
        {
            if (names[column] == names[other])
            {
                throw new Exception("Column names in header must be unique");
            }
        }

        foreach (byte b in Encoding.UTF8.GetBytes(names[column]))
        {
            switch (b)
            {
                case (byte)'\n':
                    output.Add((byte)'\\');
                    output.Add((byte)'n');
                    break;
                case (byte)'\t':
                    output.Add((byte)'\\');
                    output.Add((byte)'t');
                    break;
                case (byte)'\\':
                    output.Add((byte)'\\');
                    output.Add((byte)'\\');
                    break;
                case (byte)'#':
                    output.Add((byte)'\\');
                    output.Add((byte)'#');
                    break;
                default:
                    output.Add(b);
                    break;
            }
        }

        // Every format except Simple TSV appends ":type" to the column name
        if (tsvFormat != FormatType.SIMPLE_TSV)
        {
            output.Add((byte)':');
            try
            {
                output.AddRange(Encoding.UTF8.GetBytes(GetNameFromColumn(types[column])));
            }
            catch (Exception e)
            {
                throw new Exception($"Invalid column type for column {column}", e);
            }
        }

        output.Add(column == lastColumn ? (byte)'\n' : (byte)'\t');
    }

    // Serialize data
    SerializeTsv<T>(data, output, properties.ToArray(), types.ToArray(), tsvFormat, 0, data.Count);

    return output.ToArray();
}
2024-03-11 05:35:30 +00:00
/// <summary>
/// Appends the records <c>data[startIndex]</c> through <c>data[endIndex - 1]</c> to
/// <paramref name="bytes"/> as TSV rows, encoding each field according to its column type.
/// </summary>
/// <remarks>
/// Fields are separated by tabs. A newline follows every record except the final element
/// of <paramref name="data"/>, so a range that ends before the last record still finishes
/// with a newline. Numbers and timestamps are formatted with the invariant culture so the
/// output does not depend on the culture of the calling thread.
/// </remarks>
/// <param name="data">All records; only the requested range is written.</param>
/// <param name="bytes">Destination the encoded rows are appended to.</param>
/// <param name="columnPropertyInfos">The property that supplies each column's value.</param>
/// <param name="columnTypes">The column type of each column, parallel to <paramref name="columnPropertyInfos"/>.</param>
/// <param name="tsvFormat">The format being written; not consulted by this method.</param>
/// <param name="startIndex">Index of the first record to write.</param>
/// <param name="endIndex">Index one past the last record to write.</param>
/// <exception cref="Exception">
/// A value cannot be cast to the .NET type its column type requires, or a column type is not recognised.
/// </exception>
/// <exception cref="NotImplementedException">
/// A physical-units column has a base type other than float64 or float64-le.
/// </exception>
protected static void SerializeTsv<T>(IList<T> data, List<byte> bytes, PropertyInfo[] columnPropertyInfos, ColumnType[] columnTypes, FormatType tsvFormat, int startIndex, int endIndex)
{
    // Unwraps a float64 field value, accepting either a raw double or a UnitsNet quantity
    static double ReadFloat64(object datum)
    {
        if (datum is double d)
        {
            return d;
        }
        // TODO: check units match
        if (datum is UnitsNet.IQuantity quantity)
        {
            return quantity.Value;
        }
        throw new InvalidCastException();
    }

    // Puts bytes produced by BitConverter into little-endian order
    static byte[] ToLittleEndian(byte[] platformBytes)
    {
        if (!LittleEndian)
        {
            Array.Reverse(platformBytes);
        }
        return platformBytes;
    }

    // Serialize data. Honour the requested range rather than always walking all of data.
    for (int i = startIndex; i < endIndex; i++)
    {
        for (int j = 0; j < columnTypes.Length; j++)
        {
            object datum = columnPropertyInfos[j].GetValue(data[i]);

            try
            {
                byte[] fieldEncoded = null;
                // Some fields definitely don't need escaping, so we add them directly to bytes
                bool skipEscaping = false;

                ColumnType columnType = columnTypes[j];
                Type kind = columnType.GetType();
                var physicalUnits = columnType as PhysicalUnitsType;

                if (kind == typeof(StringType))
                {
                    fieldEncoded = Encoding.UTF8.GetBytes((string)datum);
                }
                else if (kind == typeof(BooleanType))
                {
                    bytes.AddRange((bool)datum ? TrueEncoded : FalseEncoded);
                    skipEscaping = true;
                }
                else if (kind == typeof(Float32Type))
                {
                    if (!(datum is float f))
                    {
                        throw new InvalidCastException();
                    }

                    string text;
                    if (float.IsNegativeInfinity(f))
                    {
                        text = "-inf";
                    }
                    else if (float.IsPositiveInfinity(f))
                    {
                        text = "+inf";
                    }
                    else
                    {
                        // G9 is enough digits for a float to round-trip.
                        // See https://learn.microsoft.com/en-us/dotnet/standard/base-types/standard-numeric-format-strings#round-trip-format-specifier-r
                        text = f.ToString("G9", CultureInfo.InvariantCulture);
                    }
                    bytes.AddRange(Encoding.UTF8.GetBytes(text));
                    skipEscaping = true;
                }
                else if (kind == typeof(Float32LEType))
                {
                    fieldEncoded = ToLittleEndian(BitConverter.GetBytes((float)datum));
                }
                else if (kind == typeof(Float64Type) || (physicalUnits != null && physicalUnits.BaseType is Float64Type))
                {
                    double value = ReadFloat64(datum);

                    string text;
                    if (double.IsNegativeInfinity(value))
                    {
                        text = "-inf";
                    }
                    else if (double.IsPositiveInfinity(value))
                    {
                        text = "+inf";
                    }
                    else
                    {
                        // G17 is enough digits for a double to round-trip.
                        // See https://learn.microsoft.com/en-us/dotnet/standard/base-types/standard-numeric-format-strings#round-trip-format-specifier-r
                        text = value.ToString("G17", CultureInfo.InvariantCulture);
                    }
                    bytes.AddRange(Encoding.UTF8.GetBytes(text));
                    skipEscaping = true;
                }
                else if (kind == typeof(Float64LEType) || (physicalUnits != null && physicalUnits.BaseType is Float64LEType))
                {
                    fieldEncoded = ToLittleEndian(BitConverter.GetBytes(ReadFloat64(datum)));
                }
                else if (kind == typeof(UInt32Type))
                {
                    bytes.AddRange(Encoding.UTF8.GetBytes(((UInt32)datum).ToString(CultureInfo.InvariantCulture)));
                    skipEscaping = true;
                }
                else if (kind == typeof(UInt64Type))
                {
                    bytes.AddRange(Encoding.UTF8.GetBytes(((UInt64)datum).ToString(CultureInfo.InvariantCulture)));
                    skipEscaping = true;
                }
                else if (kind == typeof(Int32Type))
                {
                    bytes.AddRange(Encoding.UTF8.GetBytes(((Int32)datum).ToString(CultureInfo.InvariantCulture)));
                    skipEscaping = true;
                }
                else if (kind == typeof(Int64Type))
                {
                    bytes.AddRange(Encoding.UTF8.GetBytes(((Int64)datum).ToString(CultureInfo.InvariantCulture)));
                    skipEscaping = true;
                }
                else if (kind == typeof(BinaryType))
                {
                    fieldEncoded = (byte[])datum;
                }
                else if (kind == typeof(Iso8601Type))
                {
                    // Invariant culture keeps the ':' separators literal, matching the
                    // format string the parser uses for this column type
                    fieldEncoded = Encoding.UTF8.GetBytes(((DateTime)datum).ToString("yyyy-MM-ddTHH:mm:ss.ffff", CultureInfo.InvariantCulture));
                }
                else if (physicalUnits != null)
                {
                    throw new NotImplementedException($"Physical units types don't support {GetNameFromColumn(physicalUnits.BaseType)} as a base type");
                }
                else
                {
                    throw new Exception($"Unexpected column type {columnTypes[j]} for column {j}");
                }

                if (!skipEscaping)
                {
                    foreach (byte b in fieldEncoded)
                    {
                        switch (b)
                        {
                            case (byte)'\n':
                                bytes.Add((byte)'\\');
                                bytes.Add((byte)'n');
                                break;
                            case (byte)'\t':
                                bytes.Add((byte)'\\');
                                bytes.Add((byte)'t');
                                break;
                            case (byte)'\\':
                                bytes.Add((byte)'\\');
                                bytes.Add((byte)'\\');
                                break;
                            case (byte)'#':
                                bytes.Add((byte)'\\');
                                bytes.Add((byte)'#');
                                break;
                            default:
                                bytes.Add(b);
                                break;
                        }
                    }
                }

                if (j < columnTypes.Length - 1)
                {
                    bytes.Add((byte)'\t');
                }
                else if (i < data.Count - 1)
                {
                    bytes.Add((byte)'\n');
                }
            }
            catch (InvalidCastException e)
            {
                throw new Exception($"Record {i}, field {j} expected type compatible with {GetNameFromColumn(columnTypes[j])}", e);
            }
        }
    }
}
2024-02-20 22:30:01 +00:00
/// <summary>
/// One record of string fields together with the column names, comment and line it was built with.
/// </summary>
public class SimpleTsvRecord
{
    /// <summary>Column names, parallel to <see cref="Fields"/>.</summary>
    public string[] ColumnNames { get; }

    /// <summary>Comment supplied for this record.</summary>
    public string Comment { get; }

    /// <summary>Field values of this record.</summary>
    public string[] Fields { get; }

    /// <summary>Line number supplied for this record.</summary>
    public int? Line { get; }

    /// <summary>Gets the field whose column has the given name.</summary>
    public string this[string columnName]
    {
        get
        {
            int position = Array.IndexOf(ColumnNames, columnName);
            return Fields[position];
        }
    }

    /// <summary>Gets the field at the given column index.</summary>
    public string this[int columnIndex]
    {
        get { return Fields[columnIndex]; }
    }

    /// <summary>Creates a record from its column names, fields, comment and line number.</summary>
    public SimpleTsvRecord(string[] columnNames, string[] fields, string comment, int line)
    {
        Line = line;
        Comment = comment;
        Fields = fields;
        ColumnNames = columnNames;
    }
}
2024-02-20 22:30:01 +00:00
2024-02-23 06:09:13 +00:00
/// <summary>
/// Base class for typed records; carries the optional line number of the record.
/// </summary>
public class TsvRecord
{
    /// <summary>Line number of the record, or null when none was given.</summary>
    public int? Line { get; set; }

    /// <summary>Creates a record with the given line number.</summary>
    public TsvRecord(int? line) => Line = line;

    /// <summary>Creates a record with no line number.</summary>
    public TsvRecord() { }
}
2024-02-20 22:30:01 +00:00
2024-02-23 06:09:13 +00:00
/// <summary>
/// A record that carries a comment in addition to its line number.
/// </summary>
public class CommentedTsvRecord : TsvRecord
{
    /// <summary>Comment attached to this record.</summary>
    public string Comment { get; set; }

    /// <summary>Creates a record with the given comment and line number.</summary>
    public CommentedTsvRecord(string comment, int? line) : base(line)
    {
        Comment = comment;
    }

    /// <summary>Creates a record with no comment and no line number.</summary>
    public CommentedTsvRecord() : base() { }
}
2024-02-23 06:09:13 +00:00
/// <summary>
/// Sample commented record with a single column, written to the header as "my-column".
/// </summary>
public class TestRecord : CommentedTsvRecord
{
    /// <summary>Value of the "my-column" column.</summary>
    [TypedTsvColumn(columnName: "my-column")]
    public string MyColumn { get; set; }
}
// TODO: Add column ordering
2024-02-23 06:09:13 +00:00
/// <summary>
/// Marks a property as a TSV column whose column type is string.
/// </summary>
public class TsvColumnAttribute : Attribute
{
    /// <summary>Header name of the column, or null when none was given.</summary>
    public string ColumnName { get; }

    /// <summary>Column type of the column; a string type unless a derived attribute overrides it.</summary>
    public virtual ColumnType ColumnType { get; } = new StringType();

    /// <summary>Creates the attribute without a column name.</summary>
    public TsvColumnAttribute() { }

    /// <summary>Creates the attribute with an explicit column name.</summary>
    public TsvColumnAttribute(string columnName) => ColumnName = columnName;
}
2024-02-20 22:30:01 +00:00
2024-02-23 06:09:13 +00:00
// TODO: Add column ordering
/// <summary>
/// Marks a property as a TSV column with an optional explicit column type.
/// </summary>
public class TypedTsvColumnAttribute : TsvColumnAttribute
{
    /// <summary>
    /// Column type given to the constructor, or null when the constructor used takes none.
    /// </summary>
    public override ColumnType ColumnType { get; }

    /// <summary>Creates the attribute with no column name and no column type.</summary>
    public TypedTsvColumnAttribute() : base() { }

    /// <summary>Creates the attribute with a column name and no column type.</summary>
    public TypedTsvColumnAttribute(string columnName) : base(columnName) { }

    /// <summary>Creates the attribute with a column name and a column type given as type text.</summary>
    public TypedTsvColumnAttribute(string columnName, string columnType) : base(columnName)
        => ColumnType = GetColumnFromString(columnType);

    /// <summary>Creates the attribute with a column type and no column name.</summary>
    public TypedTsvColumnAttribute(ColumnType columnType) : base()
        => ColumnType = columnType;
}
2024-03-16 01:58:07 +00:00
/// <summary>
/// Resolves unit text to a <see cref="UnitInfo"/> by trying it against every enum type
/// declared in the namespace of <see cref="LengthUnit"/>.
/// </summary>
/// <param name="unitName">The unit text to parse.</param>
/// <returns>The unit information for the first enum type that parses the text.</returns>
/// <exception cref="ArgumentException">No enum type parses the text.</exception>
public static UnitInfo ParseUnit(string unitName)
{
    Type referenceEnum = typeof(LengthUnit);

    foreach (Type candidate in Assembly.GetAssembly(referenceEnum).GetTypes())
    {
        // Only enums living alongside LengthUnit are tried
        bool isUnitEnum = candidate.IsEnum && candidate.Namespace == referenceEnum.Namespace;
        if (!isUnitEnum)
        {
            continue;
        }

        if (UnitParser.Default.TryParse(unitName, candidate, out Enum unitEnum))
        {
            // Successfully parsed the abbreviation, retrieve UnitInfo
            return Quantity.GetUnitInfo(unitEnum);
        }
    }

    throw new ArgumentException($"Unable to parse unit abbreviation: {unitName}");
}
2024-02-14 03:15:07 +00:00
}