Add more ideas for ExtraTSV

Add binary floating-point types
Fix some spelling
2024-02-16 17:20:21 -08:00 · 2024-02-16 17:20:11 -08:00 · 2024-02-15 20:29:45 -08:00
2 changed files with 36 additions and 3 deletions
--- a/SaneTsv/SaneTsv.cs
+++ b/SaneTsv/SaneTsv.cs
@@ -12,7 +12,9 @@ public class SaneTsv
  public class StringType : ColumnType { }
  public class BooleanType : ColumnType { }
  public class Float32Type : ColumnType { }
+  public class Float32LEType : ColumnType { }
  public class Float64Type : ColumnType { }
+  public class Float64LEType : ColumnType { }
  public class UInt32Type : ColumnType { }
  public class UInt64Type : ColumnType { }
  public class Int32Type : ColumnType { }
@@ -166,9 +168,15 @@ public class SaneTsv
              case "float32":
                type = typeof(Float32Type);
                break;
+              case "float32-le":
+                type = typeof(Float32LEType);
+                break;
              case "float64":
                type = typeof(Float64Type);
                break;
+              case "float64-le":
+                type = typeof(Float64LEType);
+                break;
              case "uint32":
                type = typeof(UInt32Type);
                break;
@@ -339,6 +347,13 @@ public class SaneTsv

        parsedFields[j] = parsedFloat;
      }
+      else if (parsed.ColumnTypes[j] == typeof(Float32LEType))
+      {
+        throw new NotImplementedException();
+        // TODO: Implement and do byte-swapping if necessary
+
+        //parsedFields[j] = parsedFloat;
+      }
      else if (parsed.ColumnTypes[j] == typeof(Float64Type))
      {
        if (!double.TryParse(fieldString, out double parsedDouble))
@@ -348,6 +363,13 @@ public class SaneTsv

        parsedFields[j] = parsedDouble;
      }
+      else if (parsed.ColumnTypes[j] == typeof(Float64LEType))
+      {
+        throw new NotImplementedException();
+        // TODO: Implement and do byte-swapping if necessary
+
+        //parsedFields[j] = parsedDouble;
+      }
      else if (parsed.ColumnTypes[j] == typeof(UInt32Type))
      {
        if (!UInt32.TryParse(fieldString, out UInt32 parsedUInt32))
--- a/SaneTsv/readme.md
+++ b/SaneTsv/readme.md
@@ -8,7 +8,7 @@ Simple TSV is a strict format for tabular data.

 '\n' (0x0A) character delimit lines, and '\t' (0x09) characters delimit fields within a line.

-'\n' and '\t' characters are allowed within fields by escaping them with a backslash character (0x5C) followed by 'n' (0x6E) and 't' (0x74) respectively. Additionaly, '\\' and '#' (0x23) must also be escaped. The '#' character is escaped for compatility with [Commented TSVs](#commented-tsv).
+'\n' and '\t' characters are allowed within fields by escaping them with a backslash character (0x5C) followed by 'n' (0x6E) and 't' (0x74) respectively. Additionally, '\\' and '#' (0x23) must also be escaped. The '#' character is escaped for compatibility with [Commented TSVs](#commented-tsv).

 All fields must be UTF-8 encoded text. All escaping can be done before decoding (and after encoding). 

@@ -29,7 +29,9 @@ Typed TSV builds on Simple TSV to allow for typing of columns. All column names
 - 'string'
 - 'boolean'
 - 'float32'
+- 'float32-le'
 - 'float64'
+- 'float64-le'
 - 'uint32'
 - 'uint64'
 - 'int32'
@@ -40,7 +42,7 @@ Any other values are an error, however, the portion of the name prior to the las

 All fields in the rest of the file must be of the type corresponding to their column.

-Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each type has the following restrictions:
+Aside from the 'binary', 'float32-le', and 'float64-le' column types, all fields must be UTF-8 encoded text. Each type has the following restrictions:

 - 'boolean' fields must contain only and exactly the text "TRUE" or "FALSE".
 - 'float32' and 'float64' correspond to single and double precision IEEE 754 floating-point numbers respectively. They should be formatted like this regex: `-?[0-9]\.([0-9]|[0-9]+[1-9])E-?[1-9][0-9]*`
@@ -50,6 +52,9 @@ Aside from the 'binary' column type, all fields must be UTF-8 encoded text. Each
    - 'qNaN'
    - '+inf'
    - '-inf'
+- 'float32-le' and 'float64-le' are also IEEE 754 floating-point, but are stored as binary. They must always be stored in little-endian order. 
+
+  The reason for having a separate binary format for them is that round-tripping floating-point text values between different parsers is not likely to work for all cases. The text-based format should be fine for general use, but when exact value transfer is needed, the binary formats are available.
 - 'uint32' and 'uint64' are unsigned 32 and 64 bit integers respectively. They should be formatted like this regex: `[1-9][0-9]*`
 - 'int32' and 'int64' are signed 32 and 64 bit integers respectively. They should be formatted like this regex: `-?[1-9][0-9]*` (except that '-0' is not allowed)

@@ -92,5 +97,11 @@ Extended formats may still use the .ctsv extension, though they could use a dedi

 - Physical units
 - Multiformats
+  - Instead of multihashes, maybe have a column type for each hash type. That way we can avoid wasting data on the type within each field.
 - ISO 8601
 - https://github.com/multiformats/unsigned-varint
+- Color codes (e.g. #E359FF)
+  - Both binary and string-based
+- JSON
+- XML
+- URL
Author	SHA1	Message	Date
Nathan McRae	53e87e2f7f	Add more ideas for ExtraTSV	2024-02-16 17:20:21 -08:00
Nathan McRae	f3ed173842	Add binary floating-point types	2024-02-16 17:20:11 -08:00
Nathan McRae	f392036982	Fix some spelling	2024-02-15 20:29:45 -08:00