Compare commits

...

3 Commits

Author SHA1 Message Date
Nathan McRae
53e87e2f7f Add more ideas for ExtraTSV 2024-02-16 17:20:21 -08:00
Nathan McRae
f3ed173842 Add binary floating-point types 2024-02-16 17:20:11 -08:00
Nathan McRae
f392036982 Fix some spelling 2024-02-15 20:29:45 -08:00
2 changed files with 36 additions and 3 deletions

View File

@ -12,7 +12,9 @@ public class SaneTsv
public class StringType : ColumnType { } public class StringType : ColumnType { }
public class BooleanType : ColumnType { } public class BooleanType : ColumnType { }
public class Float32Type : ColumnType { } public class Float32Type : ColumnType { }
/// <summary>
/// Column type selected by the "float32-le" type name: an IEEE 754 floating-point
/// value stored as binary in little-endian order rather than as text.
/// NOTE(review): presumably single precision (32-bit), by analogy with 'float32' — confirm.
/// Field parsing for this type currently throws <see cref="NotImplementedException"/>.
/// </summary>
public class Float32LEType : ColumnType { }
public class Float64Type : ColumnType { } public class Float64Type : ColumnType { }
/// <summary>
/// Column type selected by the "float64-le" type name: an IEEE 754 floating-point
/// value stored as binary in little-endian order rather than as text.
/// NOTE(review): presumably double precision (64-bit), by analogy with 'float64' — confirm.
/// Field parsing for this type currently throws <see cref="NotImplementedException"/>.
/// </summary>
public class Float64LEType : ColumnType { }
public class UInt32Type : ColumnType { } public class UInt32Type : ColumnType { }
public class UInt64Type : ColumnType { } public class UInt64Type : ColumnType { }
public class Int32Type : ColumnType { } public class Int32Type : ColumnType { }
@ -166,9 +168,15 @@ public class SaneTsv
case "float32": case "float32":
type = typeof(Float32Type); type = typeof(Float32Type);
break; break;
case "float32-le":
type = typeof(Float32LEType);
break;
case "float64": case "float64":
type = typeof(Float64Type); type = typeof(Float64Type);
break; break;
case "float64-le":
type = typeof(Float64LEType);
break;
case "uint32": case "uint32":
type = typeof(UInt32Type); type = typeof(UInt32Type);
break; break;
@ -339,6 +347,13 @@ public class SaneTsv
parsedFields[j] = parsedFloat; parsedFields[j] = parsedFloat;
} }
else if (parsed.ColumnTypes[j] == typeof(Float32LEType))
{
throw new NotImplementedException();
// TODO: Implement and do byte-swapping if necessary
//parsedFields[j] = parsedFloat;
}
else if (parsed.ColumnTypes[j] == typeof(Float64Type)) else if (parsed.ColumnTypes[j] == typeof(Float64Type))
{ {
if (!double.TryParse(fieldString, out double parsedDouble)) if (!double.TryParse(fieldString, out double parsedDouble))
@ -348,6 +363,13 @@ public class SaneTsv
parsedFields[j] = parsedDouble; parsedFields[j] = parsedDouble;
} }
else if (parsed.ColumnTypes[j] == typeof(Float64LEType))
{
throw new NotImplementedException();
// TODO: Implement and do byte-swapping if necessary
//parsedFields[j] = parsedDouble;
}
else if (parsed.ColumnTypes[j] == typeof(UInt32Type)) else if (parsed.ColumnTypes[j] == typeof(UInt32Type))
{ {
if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32)) if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))

View File

@ -8,7 +8,7 @@ Simple TSV is a strict format for tabular data.
'\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line. '\n' (0x0A) characters delimit lines, and '\t' (0x09) characters delimit fields within a line.
'\n' and '\t' characters are allowed within fields by escaping them with a backslash character (0x5C) followed by 'n' (0x6E) and 't' (0x74) respectively. Additionaly, '\\' and '#' (0x23) must also be escaped. The '#' character is escaped for compatibility with [Commented TSVs](#commented-tsv). '\n' and '\t' characters are allowed within fields by escaping them with a backslash character (0x5C) followed by 'n' (0x6E) and 't' (0x74) respectively. Additionally, '\\' and '#' (0x23) must also be escaped. The '#' character is escaped for compatibility with [Commented TSVs](#commented-tsv).
All fields must be UTF-8 encoded text. All escaping can be done before decoding (and after encoding). All fields must be UTF-8 encoded text. All escaping can be done before decoding (and after encoding).
@ -29,7 +29,9 @@ Typed TSV builds on Simple TSV to allow for typing of columns. All column names
- 'string' - 'string'
- 'boolean' - 'boolean'
- 'float32' - 'float32'
- 'float32-le'
- 'float64' - 'float64'
- 'float64-le'
- 'uint32' - 'uint32'
- 'uint64' - 'uint64'
- 'int32' - 'int32'
@ -40,7 +42,7 @@ Any other values are an error, however, the portion of the name prior to the las
All fields in the rest of the file must be of the type corresponding to their column. All fields in the rest of the file must be of the type corresponding to their column.
Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions: Aside from the 'binary', 'float32-le', and 'float64-le' column types, all fields must be UTF-8 encoded text. Each type has the following restrictions:
- 'boolean' fields must contain only and exactly the text "TRUE" or "FALSE". - 'boolean' fields must contain only and exactly the text "TRUE" or "FALSE".
- 'float32' and 'float64' correspond to single and double precision IEEE 754 floating-point numbers respectively. They should be formatted like this regex: `-?[0-9]\.([0-9]|[0-9]+[1-9])E-?[1-9][0-9]*` - 'float32' and 'float64' correspond to single and double precision IEEE 754 floating-point numbers respectively. They should be formatted like this regex: `-?[0-9]\.([0-9]|[0-9]+[1-9])E-?[1-9][0-9]*`
@ -50,6 +52,9 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
- 'qNaN' - 'qNaN'
- '+inf' - '+inf'
- '-inf' - '-inf'
- 'float32-le' and 'float64-le' are also IEEE 754 floating-point, but are stored as binary. They must always be stored in little-endian order.
The reason for having a separate binary format for them is that round-tripping floating-point text values between different parsers is not likely to work for all cases. The text-based format should be fine for general use, but when exact value transfer is needed, the binary formats are available.
- 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*` - 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
- 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed) - 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)
@ -92,5 +97,11 @@ Extended formats may still use the .ctsv extension, though they could use a dedi
- Physical units - Physical units
- Multiformats - Multiformats
- Instead of multihashes, maybe have a column type for each hash type. That way we can avoid wasting data on the type within each field.
- ISO 8601 - ISO 8601
- https://github.com/multiformats/unsigned-varint - https://github.com/multiformats/unsigned-varint
- Color codes (e.g. #E359FF)
- Both binary and string-based
- JSON
- XML
- URL