tools

various tools
git clone git://deadbeef.fr/tools.git
Log | Files | Refs | README | LICENSE

lmerge.cpp (9336B)


      1 // Copyright (c) 2020 Morel Bérenger
      2 // 
      3 // This software is provided 'as-is', without any express or implied
      4 // warranty. In no event will the authors be held liable for any damages
      5 // arising from the use of this software.
      6 // 
      7 // Permission is granted to anyone to use this software for any purpose,
      8 // including commercial applications, and to alter it and redistribute it
      9 // freely, subject to the following restrictions:
     10 // 
     11 // 1. The origin of this software must not be misrepresented; you must not
     12 //    claim that you wrote the original software. If you use this software
     13 //    in a product, an acknowledgment in the product documentation would be
     14 //    appreciated but is not required.
     15 // 2. Altered source versions must be plainly marked as such, and must not be
     16 //    misrepresented as being the original software.
     17 // 3. This notice may not be removed or altered from any source distribution.
     18 
     19 #ifdef LIBCPP_MUSL_STATIC
     20 #define __GLIBC_PREREQ(x,y) 0
     21 #endif
     22 
     23 #include <stdlib.h>
     24 #include <stdio.h>
     25 #include <errno.h>
     26 #include <string.h>
     27 #include <assert.h>
     28 #include <stdint.h>
     29 #include <ctype.h>
     30 
     31 #include <unistd.h>
     32 
     33 #include <algorithm>
     34 #include <iterator>
     35 
     36 /**
     37  * TODO:
     38  * * check that ENTRY_SEP works as expected;
     39  * * fix the fact input needs a "\\n" at end of last line for it to be merged;
     40  * * UTF-8 support (field separators);
     41  * * -v/--version option;
     42  * * allow to customize the memory allocation scheme at runtime;
     43  * * allow to not print twice merged fields;
     44  * * allow to set verbosity on stderr;
     45  * * remove hard-coded limit of UINT16_MAX - 1 for fields start/stop positions;
     46  * * print as many lines as there were duplicates?
     47  *
     48  * Coding rules:
     49  * * const affects what is before it, so it must follow the type;
     50  **/
     51 
     52 #include <vector.hpp>
     53 #include <optparser.hpp>
     54 
     55 class field_marker;
     56 typedef vector<char> line_cache;
     57 typedef vector<field_marker> field_marker_t;
     58 
     59 class field_marker
     60 {
     61 	uint16_t m_start = UINT16_MAX, m_end = UINT16_MAX;
     62 
     63 public:
     64 	bool ignore( void ) const
     65 	{
     66 		return m_start == m_end && m_start == UINT16_MAX;
     67 	}
     68 
     69 	void define( uint16_t start, uint16_t end )
     70 	{
     71 		assert( end >= start && start != UINT8_MAX && end != UINT8_MAX );
     72 		m_start = start; m_end = end;
     73 	}
     74 
     75 	uint16_t start( void ) const
     76 	{
     77 		assert( !ignore() );
     78 		return m_start;
     79 	}
     80 
     81 	uint16_t end( void ) const
     82 	{
     83 		assert( !ignore() );
     84 		return m_end;
     85 	}
     86 };
     87 
     88 bool allocate_markers(
     89 		char const * const FIELDS,
     90 		field_marker_t& field_cache
     91 );
     92 
     93 void print_help( char const* pgm, FILE* target, opt_desc_t const* start, opt_desc_t const* end );
     94 
     95 int main( int argc, char **argv )
     96 {
     97 	char const * SEP_START = getenv( "FIELD_SEP" );
     98 	char const * SEP_ENTRY = getenv( "ENTRY_SEP" );
     99 	char const * FIELDS    = getenv( "FIELDS" );
    100 	if( !SEP_START )
    101 	{
    102 		SEP_START = " \t";
    103 	}
    104 	if( !SEP_ENTRY )
    105 	{
    106 		SEP_ENTRY = "\n";
    107 	}
    108 
    109 	opt_desc_t opts[] =
    110 	{
    111 		STD_HELP,
    112 #ifndef NO_CMDLINE
    113 		{ "field_sep", "field separator"  , 't', 0, &SEP_START, set<char const**>, show<char*> },
    114 		{ "entry_sep", "entry separator"  , 'l', 0, &SEP_ENTRY, set<char const**>, show<char*> },
    115 		{ "fields"   , "fields to compare", 'f', 0, &FIELDS   , set<char const**>, show<char*> },
    116 #endif
    117 	};
    118 	auto b_opts = std::begin( opts );
    119 	auto e_opts = std::end( opts );
    120 
    121 	char **arg = &argv[1]; assert( argc > 0 );
    122 	for( int iarg = 1; iarg != argc; ++iarg, ++arg )
    123 	{
    124 		auto error = parse_cmd_opt( *arg, b_opts, e_opts );
    125 		switch( error )
    126 		{
    127 			case MAX_COUNT:
    128 				arg_warning( *arg, error );
    129 				break;
    130 			case NONE:
    131 			case IGNORED:
    132 				break;
    133 			case SET_NO_VAL:
    134 			case SET_VAL_IGN:
    135 			case SET_FAIL:
    136 			case BAD_ARGS:
    137 			case BAD_SETTER:
    138 				print_help( argv[0], stderr, b_opts, e_opts );
    139 				arg_error( *arg, error );
    140 				return EXIT_FAILURE;
    141 		}
    142 	}
    143 
    144 	if( opts[0].count )
    145 	{
    146 		print_help( argv[0], stdout, b_opts, e_opts );
    147 		return EXIT_SUCCESS;
    148 	}
    149 
    150 	if( !FIELDS )
    151 	{
    152 		fputs( "ERROR: FIELDS is not defined\n", stderr );
    153 		return EXIT_FAILURE;
    154 	}
    155 
    156 	if( strlen( FIELDS ) == 0 )
    157 	{
    158 		fputs( "ERROR: FIELDS is empty\n", stderr );
    159 		return EXIT_FAILURE;
    160 	}
    161 
    162 	field_marker_t field_cache;
    163 	if( allocate_markers( FIELDS, field_cache ) )
    164 	{
    165 		return EXIT_FAILURE;
    166 	}
    167 
    168 	size_t buf_sz = 2048;
    169 	char* buf = nullptr;
    170 
    171 	// allocating a cache of at least 16 bytes.
    172 	// Note: I don't see how merging lines smaller than 16 bytes can be useful
    173 	// also, not even enough mem for that would indicate bigger problems...
    174 	while( !buf && buf_sz >= 32 )
    175 	{
    176 		buf_sz /= 2;
    177 		char* nbuf = static_cast<char*>( realloc( buf, buf_sz ) );
    178 		if( !nbuf )
    179 		{
    180 			free( buf );
    181 			return EXIT_FAILURE;
    182 		}
    183 		buf = nbuf;
    184 	}
    185 
    186 	if( !buf )
    187 	{
    188 		fprintf( stderr, "ERROR: malloc %s(%d)\n", strerror( errno ), errno );
    189 		free( buf );
    190 		return EXIT_FAILURE;
    191 	}
    192 
    193 	bool fetch = true;
    194 	line_cache last_line;
    195 	char const * const SEP_END = SEP_START + strlen( SEP_START );
    196 	while( !feof( stdin ) )
    197 	{
    198 		if( !fgets( buf, static_cast<int>( buf_sz ), stdin ) )
    199 		{
    200 			free( buf );
    201 			buf = nullptr;
    202 			if( !feof( stdin ) )
    203 			{
    204 				fprintf( stderr, "ERROR: fgets %s(%d)\n", strerror( errno ), errno );
    205 				free( buf );
    206 				return EXIT_FAILURE;
    207 			}
    208 			break;
    209 		}
    210 
    211 		size_t str_sz = strlen( buf );
    212 		if( str_sz == buf_sz - 1 && buf[str_sz] != '\n' && !feof( stdin ) )
    213 		{
    214 			fprintf( stderr, "ERROR: buffer too small for some lines\n" );
    215 			free( buf );
    216 			return EXIT_FAILURE;
    217 		}
    218 		
    219 		if( !fetch )
    220 		{
    221 			char const* dst_ptr = buf;
    222 			for( size_t i = 0; i < field_cache.size(); ++i )
    223 			{
    224 				field_marker const& src = field_cache[i];
    225 				if( src.ignore() )
    226 				{
    227 					char const *sep = SEP_END;
    228 					while( sep == SEP_END )
    229 					{
    230 						++dst_ptr;
    231 						sep = SEP_START;
    232 						for( ; sep != SEP_END && *dst_ptr && *dst_ptr != *sep; ++sep ){}
    233 					}
    234 					++dst_ptr;
    235 					continue;
    236 				}
    237 				char const * src_ptr = last_line.data() + src.start();
    238 				size_t len = src.end() - src.start();
    239 				if( len > buf_sz - static_cast<size_t>( dst_ptr - buf ) )
    240 				{
    241 					fetch = true;
    242 					break;
    243 				}
    244 
    245 				if( 0 != memcmp( dst_ptr, src_ptr, len ) )
    246 				{
    247 					fetch = true;
    248 					break;
    249 				}
    250 
    251 				char last = dst_ptr[len];
    252 				char const * sep_ = SEP_START;
    253 				assert( sep_ != nullptr );
    254 				while( 0 != *sep_ && *sep_ != last && *SEP_ENTRY != last )
    255 				{
    256 					++sep_;
    257 				}
    258 				if( 0 == *sep_ )
    259 				{
    260 					fetch = true;
    261 					break;
    262 				}
    263 				dst_ptr += len;
    264 				assert( dst_ptr >= buf );
    265 			}
    266 
    267 			fputc( fetch ? *SEP_ENTRY : * SEP_START, stdout );
    268 		}
    269 
    270 		last_line.assign( buf, buf + str_sz );
    271 		if( last_line.back() == *SEP_ENTRY )
    272 		{
    273 			last_line.back() = 0;
    274 		}
    275 
    276 		if( fetch )
    277 		{
    278 			line_cache::iterator start     = last_line.begin();
    279 			line_cache::iterator cache_end = last_line.end();
    280 			size_t field_index = 0;
    281 			while( start != cache_end && field_index < field_cache.size() )
    282 			{
    283 				auto end = last_line.end();
    284 				if( last_line.back() == 0 )
    285 				{
    286 					--end;
    287 				}
    288 				line_cache::iterator it = std::find_first_of
    289 					(
    290 					 start, end,
    291 					 SEP_START, SEP_END
    292 					);
    293 				if( !field_cache[field_index].ignore() )
    294 				{
    295 					field_cache[field_index].define(
    296 							static_cast<uint16_t>( start - last_line.begin() ),
    297 							static_cast<uint16_t>( it - last_line.begin() )
    298 							);
    299 				}
    300 				start = it + 1;
    301 				++field_index;
    302 			}
    303 			fetch = false;
    304 		}
    305 		fputs( last_line.data(), stdout );
    306 	}
    307 	fputc( *SEP_ENTRY, stdout );
    308 	free( buf );
    309 	return EXIT_SUCCESS;
    310 }
    311 
    312 bool allocate_markers(
    313 		char const * const FIELDS,
    314 		field_marker_t& field_cache
    315 )
    316 {
    317 	field_cache.reserve( UINT8_MAX ); //255 fields should fit most cases
    318 	size_t last_field = 0;
    319 
    320 	char const * fields = FIELDS - 1;
    321 	do
    322 	{
    323 		++fields;
    324 		if( isdigit( *fields ) )
    325 		{
    326 			last_field = last_field * 10 + static_cast<size_t>( *fields - '0' );
    327 		}
    328 		else if( *fields == ',' || *fields == 0 )
    329 		{
    330 			size_t max = std::max( field_cache.size(), last_field );
    331 			field_cache.resize( max );
    332 			field_cache[last_field - 1].define( 0, 0 );
    333 			last_field = 0;
    334 		}
    335 		else
    336 		{
    337 			fputs( "ERROR: FIELDS contains illegal characters\n", stderr );
    338 			return true;
    339 		}
    340 	}while( *fields );
    341 	field_cache.shrink_to_fit();
    342 	return false;
    343 }
    344 
    345 void print_help( char const* pgm, FILE* target, opt_desc_t const* start, opt_desc_t const* end )
    346 {
    347 	uint16_t w, h;
    348 	if( term_ch_size( &w, &h, STDOUT_FILENO ) )
    349 	{
    350 		w = 80;
    351 		fputs( "TODO: could not get terminal's size\n", stderr );
    352 	}
    353 	fputs( "Usage: ", target );
    354 	fputs( pgm, target );
    355 	fputs( " [OPTIONS]\nDescription:\n", target );
    356 	char desc[] =
    357 		"This program reads stdin and when consecutive lines have specific "
    358 		"fields all containing the same value, prints them replacing the "
    359 		"newline character by the 1st character in FIELD_SEP.\n"
    360 		"Fields are delimited by the FIELD_SEP environment variable. If not "
    361 		"defined, \" \\t\" is used instead (see isblank(3)).\n"
    362 		"Fields to use are defined by the environment variable FIELDS, which "
    363 		"only use unsigned decimal integers separated by commas, other "
    364 		"characters makes the value invalid.\n"
    365 		"If FIELDS is not defined or invalid, exits with an error.\n"
    366 		"Empty field indexes (\"1,,3\") are ignored (will resolve in \"1,3\").\n"
    367 		"Do not work if input is not in line mode.\n"
    368 		"Line separator is defined by ENTRY_SEP, or \"\\n\" if not defined.\n"
    369 		;
    370 	if( indent_txt( desc, std::end( desc ), 1, w, 8, target ) )
    371 	{
    372 		fputs( "TODO: handle indent_txt's errors\n", stderr );
    373 	}
    374 
    375 	fputs( "Options:\n", target );
    376 	print_opts( target, start, end );
    377 }
    378