commit 5287ec7c5a58a8a00776b8de4331cc4b5ea490e6
Author: Morel Bérenger <berenger.morel@neutralite.org>
Date: Tue, 14 May 2019 01:25:38 +0200
initial commit: lot of cleanup and tests to do
Diffstat:
A | merge.cpp | | | 281 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 281 insertions(+), 0 deletions(-)
diff --git a/merge.cpp b/merge.cpp
@@ -0,0 +1,281 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <ctype.h>
+
+#include <algorithm>
+
+#include <vector>
+
+/**
+ * This program reads stdin and when consecutive lines have specific fields all
+ * containing the same value, prints them replacing the newline character by
+ * the 1st character in FIELD_SEP.
+ * Fields are delimited by the FIELD_SEP environment variable. If not defined,
+ * " \\t" is used instead (see isblank(3)).
+ * Fields to use are defined by the environment variable FIELDS, which may only
+ * contain unsigned decimal integers separated by commas; any other character
+ * makes the value invalid.
+ * If FIELDS is not defined or invalid, exits with an error.
+ * Empty field indexes ("1,,3") are ignored (will resolve in "1,3").
+ * Does not work if input is not in line mode.
+ * Line separator is defined by ENTRY_SEP, or "\\n" if not defined.
+ *
+ * TODO:
+ * * UTF-8 support (field separators);
+ * * providing FIELDS variable as command-line option;
+ * * -v/--version option;
+ * * -h/--help option;
+ * * remove bloated STL containers;
+ * * allow to customize the memory allocation scheme at runtime;
+ * * allow to not print twice merged fields;
+ * * allow to set verbosity on stderr;
+ * * remove hard-coded limit of UINT16_MAX - 1 for fields start/stop positions;
+ *
+ * Coding rules:
+ * * const affects what precedes it, so it must follow the type;
+ **/
+
+/**
+ * Start/end offsets of one field inside a cached line.
+ *
+ * Both offsets default to UINT16_MAX, the sentinel meaning "no range defined"
+ * (see ignore()). define() stores a real range; start() and end() may only be
+ * called once a range has been defined.
+ **/
+class field_marker
+{
+    uint16_t m_start = UINT16_MAX, m_end = UINT16_MAX;
+
+public:
+    /// True while both offsets still hold the UINT16_MAX sentinel.
+    bool ignore( void ) const
+    {
+        return m_start == m_end && m_start == UINT16_MAX;
+    }
+
+    /// Stores the range. Asserts start <= end and that neither is the sentinel.
+    void define( uint16_t start, uint16_t end )
+    {
+        // The sentinel is UINT16_MAX. Comparing against UINT8_MAX rejected the
+        // perfectly valid offset 255 and let the real sentinel through.
+        assert( end >= start && start != UINT16_MAX && end != UINT16_MAX );
+        m_start = start; m_end = end;
+    }
+
+    /// Start offset of the defined range (asserts that a range is defined).
+    uint16_t start( void ) const
+    {
+        assert( !ignore() );
+        return m_start;
+    }
+
+    /// End offset of the defined range (asserts that a range is defined).
+    uint16_t end( void ) const
+    {
+        assert( !ignore() );
+        return m_end;
+    }
+};
+
+// Fills field_cache from the comma-separated index list FIELDS.
+// Returns true when FIELDS is rejected (a message is written to stderr),
+// false on success.
+bool allocate_markers( char const * const FIELDS, std::vector<field_marker>& field_cache );
+
+/**
+ * Reads lines from stdin and joins consecutive lines whose selected fields
+ * hold the same values, writing the result to stdout.
+ *
+ * Environment: FIELDS (mandatory), FIELD_SEP (default " \t"), ENTRY_SEP
+ * (default "\n"). Only the first character of FIELD_SEP / ENTRY_SEP is used
+ * when writing separators.
+ *
+ * Returns EXIT_FAILURE on a configuration, allocation or read error and when
+ * a line does not fit in the buffer; EXIT_SUCCESS otherwise.
+ **/
+int main( void )
+{
+    char const * const DEFAULT_FIELD_SEP = " \t";
+    char const * const DEFAULT_ENTRY_SEP = "\n";
+
+    char const * SEP_START = getenv( "FIELD_SEP" );
+    if( !SEP_START )
+    {
+        SEP_START = DEFAULT_FIELD_SEP;
+    }
+    char const * SEP_ENTRY = getenv( "ENTRY_SEP" );
+    if( !SEP_ENTRY )
+    {
+        SEP_ENTRY = DEFAULT_ENTRY_SEP;
+    }
+
+    char const * const FIELDS = getenv( "FIELDS" );
+    if( !FIELDS )
+    {
+        fputs( "ERROR: FIELDS is not defined\n", stderr );
+        return EXIT_FAILURE;
+    }
+
+    if( strlen( FIELDS ) == 0 )
+    {
+        fputs( "ERROR: FIELDS is empty\n", stderr );
+        return EXIT_FAILURE;
+    }
+
+    std::vector<field_marker> field_cache;
+    if( allocate_markers( FIELDS, field_cache ) )
+    {
+        return EXIT_FAILURE;
+    }
+
+    size_t buf_sz = 2048;
+    char* buf = nullptr;
+
+    // I don't see how merging lines smaller than 16 bytes can be useful
+    // also, not even enough mem for that would indicate bigger problems...
+    // (the size is halved before each attempt, so the first try is 1024 bytes)
+    while( !buf && buf_sz >= 32 )
+    {
+        buf_sz /= 2;
+        buf = static_cast<char*>( malloc( buf_sz ) );
+    }
+
+    if( !buf )
+    {
+        fprintf( stderr, "ERROR: malloc %s(%d)\n", strerror( errno ), errno );
+        return EXIT_FAILURE;
+    }
+
+    // fetch == true means "the current line starts a new output entry":
+    // either it is the very first line or its key fields differ from the
+    // previous line.
+    bool fetch = true;
+    bool wrote_output = false;
+    typedef std::vector<char> line_cache;
+    line_cache last_line;
+    char const * const SEP_END = SEP_START + strlen( SEP_START );
+
+    // True when c is one of the FIELD_SEP characters (the terminating NUL of
+    // FIELD_SEP is deliberately not part of the searched range).
+    auto const is_field_sep = [SEP_START, SEP_END]( char c )
+    {
+        return std::find( SEP_START, SEP_END, c ) != SEP_END;
+    };
+
+    while( !feof( stdin ) )
+    {
+        if( !fgets( buf, static_cast<int>( buf_sz ), stdin ) )
+        {
+            if( !feof( stdin ) )
+            {
+                // report before free(), which is allowed to change errno
+                fprintf( stderr, "ERROR: fgets %s(%d)\n", strerror( errno ), errno );
+                free( buf );
+                return EXIT_FAILURE;
+            }
+            break;
+        }
+
+        size_t const str_sz = strlen( buf );
+        // A full buffer whose last *character* is not a newline means the
+        // line was truncated. buf[str_sz] is always the NUL terminator, so
+        // the character to test is buf[str_sz - 1].
+        if( str_sz == buf_sz - 1 && buf[str_sz - 1] != '\n' && !feof( stdin ) )
+        {
+            fprintf( stderr, "ERROR: buffer too small for some lines\n" );
+            free( buf );
+            return EXIT_FAILURE;
+        }
+
+        if( !fetch )
+        {
+            // Compare the key fields of the new line (in buf) with the ones
+            // recorded for the previous line (offsets into last_line).
+            // Invariant: dst_ptr points at the first character of field i.
+            char const * const buf_end = buf + str_sz; // the NUL terminator
+            char const * dst_ptr = buf;
+            for( size_t i = 0; i < field_cache.size(); ++i )
+            {
+                field_marker const& src = field_cache[i];
+                if( src.ignore() )
+                {
+                    // Not a key field: skip its content, never walking past
+                    // the end of the line.
+                    while( dst_ptr != buf_end && !is_field_sep( *dst_ptr ) )
+                    {
+                        ++dst_ptr;
+                    }
+                    if( dst_ptr == buf_end )
+                    {
+                        // the line ends before the next key field
+                        fetch = true;
+                        fputs( "mismatch 1\n", stderr );
+                        break;
+                    }
+                    ++dst_ptr; // step over the separator
+                    continue;
+                }
+                char const * src_ptr = last_line.data() + src.start();
+                size_t const len = static_cast<size_t>( src.end() - src.start() );
+                // bound by the text actually read, not by the buffer capacity
+                if( len > static_cast<size_t>( buf_end - dst_ptr ) )
+                {
+                    fetch = true;
+                    fputs( "mismatch 1\n", stderr );
+                    break;
+                }
+
+                if( 0 != memcmp( dst_ptr, src_ptr, len ) )
+                {
+                    fetch = true;
+                    fputs( "mismatch 3\n", stderr );
+                    break;
+                }
+
+                // The field must stop here too: on a field separator, on the
+                // entry separator, or at the end of a final unterminated line.
+                char const last = dst_ptr[len];
+                if( !is_field_sep( last ) && last != *SEP_ENTRY && last != 0 )
+                {
+                    fputs( "mismatch 2\n", stderr );
+                    fetch = true;
+                    break;
+                }
+                dst_ptr += len;
+                // Restore the invariant so that a key field directly
+                // following another key field is compared from its start.
+                if( is_field_sep( *dst_ptr ) )
+                {
+                    ++dst_ptr;
+                }
+            }
+
+            fputc( fetch ? *SEP_ENTRY : * SEP_START, stdout );
+        }
+
+        // Cache the line as a NUL-terminated string without its trailing
+        // entry separator, whether or not that separator was present.
+        last_line.assign( buf, buf + str_sz );
+        if( !last_line.empty() && last_line.back() == *SEP_ENTRY )
+        {
+            last_line.back() = 0;
+        }
+        else
+        {
+            last_line.push_back( 0 );
+        }
+
+        // Record where every key field sits in the cached line. This is done
+        // for every line, because last_line was just replaced and offsets
+        // taken from an older line would no longer be valid.
+        line_cache::iterator const text_end = last_line.end() - 1; // the NUL
+        line_cache::iterator start = last_line.begin();
+        size_t field_index = 0;
+        while( start != last_line.end() && field_index < field_cache.size() )
+        {
+            line_cache::iterator it = std::find_first_of
+            (
+                start, text_end,
+                SEP_START, SEP_END
+            );
+            if( !field_cache[field_index].ignore() )
+            {
+                field_cache[field_index].define(
+                    static_cast<uint16_t>( start - last_line.begin() ),
+                    static_cast<uint16_t>( it - last_line.begin() )
+                );
+            }
+            start = it + 1; // it <= text_end, so this is at most end()
+            ++field_index;
+        }
+        // Key fields the line does not have become empty ranges at the end
+        // of the text, so no marker keeps an offset from an earlier line.
+        uint16_t const text_len = static_cast<uint16_t>( text_end - last_line.begin() );
+        for( ; field_index < field_cache.size(); ++field_index )
+        {
+            if( !field_cache[field_index].ignore() )
+            {
+                field_cache[field_index].define( text_len, text_len );
+            }
+        }
+        fetch = false;
+
+        fputs( last_line.data(), stdout );
+        wrote_output = true;
+    }
+
+    // close the last entry, but emit nothing at all for empty input
+    if( wrote_output )
+    {
+        fputc( *SEP_ENTRY, stdout );
+    }
+    free( buf );
+    return EXIT_SUCCESS;
+}
+
+/**
+ * Parses FIELDS, a comma-separated list of 1-based decimal field indexes,
+ * and marks each listed field in field_cache as a key field by giving it a
+ * defined (0, 0) range. field_cache is grown to the largest index seen;
+ * entries that are not listed keep their default "ignored" state.
+ *
+ * Empty indexes ("1,,3", trailing comma) are skipped.
+ *
+ * Returns true (after printing a message on stderr) when FIELDS contains a
+ * character other than a digit or a comma, an index of 0, an index of
+ * UINT16_MAX or more, or no index at all. Returns false on success.
+ **/
+bool allocate_markers(
+    char const * const FIELDS,
+    std::vector<field_marker>& field_cache
+)
+{
+    field_cache.reserve( UINT8_MAX ); //255 fields should fit most cases
+    size_t last_field = 0;
+    bool has_digit = false;
+
+    for( char const * fields = FIELDS; ; ++fields )
+    {
+        char const c = *fields;
+        // isdigit() needs a value representable as unsigned char
+        if( isdigit( static_cast<unsigned char>( c ) ) )
+        {
+            has_digit = true;
+            last_field = last_field * 10 + static_cast<size_t>( c - '0' );
+            // Field offsets are stored as uint16_t, so a line can never hold
+            // that many fields; stopping here also prevents overflow.
+            if( last_field >= UINT16_MAX )
+            {
+                fputs( "ERROR: FIELDS index is too large\n", stderr );
+                return true;
+            }
+        }
+        else if( c == ',' || c == 0 )
+        {
+            // An empty index is skipped instead of writing to element -1.
+            if( has_digit )
+            {
+                if( last_field == 0 )
+                {
+                    fputs( "ERROR: FIELDS indexes start at 1\n", stderr );
+                    return true;
+                }
+                if( field_cache.size() < last_field )
+                {
+                    field_cache.resize( last_field );
+                }
+                field_cache[last_field - 1].define( 0, 0 );
+            }
+            last_field = 0;
+            has_digit = false;
+            if( c == 0 )
+            {
+                break;
+            }
+        }
+        else
+        {
+            fputs( "ERROR: FIELDS contains illegal characters\n", stderr );
+            return true;
+        }
+    }
+
+    if( field_cache.empty() )
+    {
+        fputs( "ERROR: FIELDS does not contain any field index\n", stderr );
+        return true;
+    }
+    field_cache.shrink_to_fit();
+    return false;
+}