renamed the command lmerge to avoid a conflict with rcs merge binary (wonder if someone uses that though) - tools

commit 17c3065ecfa59bee483911aa7569852c57bda17b
parent cb277a9720d9e8ece11b3f66bc1e47bf07069574
Author: Morel Bérenger <berenger.morel@neutralite.org>
Date:   Wed, 26 Feb 2020 07:59:35 +0100

renamed the command lmerge to avoid a conflict with rcs merge binary (wonder if someone uses that though)

Diffstat:
M Makefile  | 8 ++++----
M README  | 6 +++---
A lmerge.1.md  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lmerge.cpp  | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D merge.1.md  | 68 --------------------------------------------------------------------
D merge.cpp  | 287 -------------------------------------------------------------------------------

6 files changed, 364 insertions(+), 362 deletions(-)
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@ CC ?= cc
 CXX ?= c++
 
 .PHONY: all
-all: manpages merge.1 merge
+all: manpages lmerge.1 lmerge
 
 %.1: %.1.md
 	pandoc -s --to=man $< -o $@
@@ -10,11 +10,11 @@ all: manpages merge.1 merge
 %.o: %.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-merge: merge.o
+lmerge: lmerge.o
 	$(CXX) -o $@ $^
 
-manpages: merge.1
+manpages: lmerge.1
 
 .PHONY: clean
 clean:
-	rm -f merge merge.1 *.o
+	rm -f lmerge lmerge.1 *.o
diff --git a/README b/README
@@ -2,7 +2,7 @@ This tool merges sequential entries if they have some fields with same values.
 
 USAGE:
 
-See merge.1.md
+See lmerge.1.md
 
 DEPENDENCIES:
 
@@ -22,8 +22,8 @@ example it's more hackish un current stable (Buster) than it was in old-stable
 On Debian buster, I do this for example (beware, every single change can break
 the build):
 
-clang++ -o merge \
-  merge.cpp /usr/lib/x86_64-linux-musl/crt1.o \
+clang++ -o lmerge \
+  lmerge.cpp /usr/lib/x86_64-linux-musl/crt1.o \
   -Os -nostdlib -static -fno-exceptions -stdlib=libc++ -nobuiltininc -nostdinc++ \
   -L /usr/lib/x86_64-linux-musl  \
   -lpthread -lc \
diff --git a/lmerge.1.md b/lmerge.1.md
@@ -0,0 +1,68 @@
+% lmerge(1) lmerge manpage
+% Bérenger Morel
+% 2020-02-25
+
+# NAME
+
+*lmerge* - merges entries with common fields
+
+# SYNOPSIS
+
+`cat foo | lmerge`
+
+# DESCRIPTION
+
+Merges consecutive entries when they share a common field.
+Entries are read from stdin.
+
+*lmerge* does not remove duplicated fields.
+
+# OPTIONS
+
+For now, options are managed through environment variables:
+
+FIELDS
+
+: list of indexes (starting from 0, separated with **commas** (\',\', 0x2C)) of
+the fields that will be compared.
+
+FIELD_SEP
+
+: list of characters that will be considered as field separators.
+Defaults to **space** (\' \', 0x20) and **horizontal tabulation** (\'\\t\', 0x09).
+
+ENTRY_SEP
+
+: list of characters that will be considered as entry separators.
+Defaults to **newline** (\'\\n\', 0x0A).
+
+# EXAMPLE
+
+This invocation:
+
+```sh
+FIELD_SEP=": \t" FIELDS="1,3" ./lmerge <<EOF
+0 foo:hello:1
+1 bar:hello:2
+	2:foo:world:3
+	2:bar:world:4
+EOF
+```
+
+will generate this result:
+
+```
+0 foo:hello:1
+1 bar:hello:2
+	2:foo:world:3:	2:bar:world:4
+```
+
+# BUGS
+
+Multi-byte characters can not be used for FIELD_SEP and ENTRY_SEP.
+
+# TODO
+
+Allow the use of command-line parameters to configure behavior.
+
+# SEE ALSO
diff --git a/lmerge.cpp b/lmerge.cpp
@@ -0,0 +1,289 @@
+#ifdef LIBCPP_MUSL_STATIC
+#define __GLIBC_PREREQ(x,y) 0
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <ctype.h>
+
+#include <algorithm>
+#include <vector>
+
+/**
+ * This program reads stdin and when consecutive lines have specific fields all
+ * containing the same value, prints them replacing the newline character by
+ * the 1st character in FIELD_SEP.
+ * Fields are delimited by the FIELD_SEP environment variable. If not defined,
+ * " \\t" is used instead (see isblank(3)).
+ * Fields to use are defined by the environment variable FIELDS, which only
+ * accepts unsigned decimal integers separated by commas; any other character
+ * makes the value invalid.
+ * If FIELDS is not defined or invalid, exits with an error.
+ * Empty field indexes ("1,,3") are ignored (will resolve in "1,3").
+ * Does not work if input is not in line mode.
+ * Line separator is defined by ENTRY_SEP, or "\\n" if not defined.
+ *
+ * TODO:
+ * * check that ENTRY_SEP works as expected;
+ * * fix the fact input needs a "\\n" at end of last line for it to be merged;
+ * * UTF-8 support (field separators);
+ * * providing FIELDS variable as command-line option;
+ * * -v/--version option;
+ * * -h/--help    option;
+ * * remove bloated STL containers;
+ * * allow to customize the memory allocation scheme at runtime;
+ * * allow to not print twice merged fields;
+ * * allow to set verbosity on stderr;
+ * * remove hard-coded limit of UINT16_MAX - 1 for fields start/stop positions;
+ * * print as many lines as there were duplicates?
+ *
+ * Coding rules:
+ * * const affects what is before it, so it must follow the type;
+ **/
+
+#include "vector.hpp"
+
+class field_marker;
+typedef vector<char> line_cache;
+typedef vector<field_marker> field_marker_t;
+
+class field_marker // byte range [start, end) of one compared field inside a cached line
+{
+	uint16_t m_start = UINT16_MAX, m_end = UINT16_MAX; // both UINT16_MAX: no range set
+	// UINT16_MAX is reserved as the "unset" sentinel, so it is never a valid offset.
+public:
+	bool ignore( void ) const // true while define() was never called on this marker
+	{
+		return m_start == m_end && m_start == UINT16_MAX;
+	}
+	// Stores the range; the bounds must be ordered and must differ from the sentinel.
+	void define( uint16_t start, uint16_t end )
+	{
+		assert( end >= start && start != UINT16_MAX && end != UINT16_MAX );
+		m_start = start; m_end = end;
+	}
+	// Offset of the first byte of the field; must not be called on an unset marker.
+	uint16_t start( void ) const
+	{
+		assert( !ignore() );
+		return m_start;
+	}
+	// Offset one past the last byte of the field; must not be called on an unset marker.
+	uint16_t end( void ) const
+	{
+		assert( !ignore() );
+		return m_end;
+	}
+};
+
+bool allocate_markers(
+		char const * const FIELDS,
+		field_marker_t& field_cache
+);
+
+int main( void ) // copies stdin to stdout, joining consecutive entries whose FIELDS match
+{
+	char const * const DEFAULT_FIELD_SEP = " \t";
+	char const * const DEFAULT_ENTRY_SEP = "\n";
+	// Configuration is read from the environment; only FIELDS is mandatory.
+	char const * SEP_START = getenv( "FIELD_SEP" );
+	if( !SEP_START )
+	{
+		SEP_START = DEFAULT_FIELD_SEP;
+	}
+	char const * SEP_ENTRY = getenv( "ENTRY_SEP" );
+	if( !SEP_ENTRY )
+	{
+		SEP_ENTRY = DEFAULT_ENTRY_SEP;
+	}
+
+	char const * const FIELDS = getenv( "FIELDS" );
+	if( !FIELDS )
+	{
+		fputs( "ERROR: FIELDS is not defined\n", stderr );
+		return EXIT_FAILURE;
+	}
+
+	if( strlen( FIELDS ) == 0 )
+	{
+		fputs( "ERROR: FIELDS is empty\n", stderr );
+		return EXIT_FAILURE;
+	}
+
+	field_marker_t field_cache;
+	if( allocate_markers( FIELDS, field_cache ) )
+	{
+		return EXIT_FAILURE;
+	}
+
+	size_t buf_sz = 2048; // halved before every attempt, so 1024 bytes are tried first
+	char* buf = nullptr;
+
+	// allocating a cache of at least 16 bytes.
+	// Note: I don't see how merging lines smaller than 16 bytes can be useful
+	// also, not even enough mem for that would indicate bigger problems...
+	while( !buf && buf_sz >= 32 )
+	{
+		buf_sz /= 2;
+		buf = static_cast<char*>( malloc( buf_sz ) );
+	}
+
+	if( !buf )
+	{
+		fprintf( stderr, "ERROR: malloc %s(%d)\n", strerror( errno ), errno );
+		return EXIT_FAILURE;
+	}
+
+	bool fetch = true; // true: field ranges must be recomputed from the current line
+	line_cache last_line;
+	char const * const SEP_END = SEP_START + strlen( SEP_START );
+	while( !feof( stdin ) )
+	{
+		if( !fgets( buf, static_cast<int>( buf_sz ), stdin ) )
+		{
+			free( buf );
+			buf = nullptr;
+			buf_sz = 0;
+			if( !feof( stdin ) )
+			{
+				fprintf( stderr, "ERROR: fgets %s(%d)\n", strerror( errno ), errno );
+				return EXIT_FAILURE;
+			}
+			break;
+		}
+
+		size_t str_sz = strlen( buf );
+		if( str_sz == buf_sz - 1 && buf[str_sz - 1] != '\n' && !feof( stdin ) )
+		{
+			fprintf( stderr, "ERROR: buffer too small for some lines\n" );
+			return EXIT_FAILURE;
+		}
+		// buf[str_sz] is always the NUL: the last character read is buf[str_sz - 1].
+		if( !fetch )
+		{
+			char const* dst_ptr = buf; // cursor inside the line that was just read
+			for( size_t i = 0; i < field_cache.size(); ++i )
+			{
+				field_marker const& src = field_cache[i];
+				if( src.ignore() ) // field not listed in FIELDS: skip it in buf
+				{
+					// NOTE(review): this scan can step past the NUL of buf and
+					// never ends when FIELD_SEP is empty -- left as is, to confirm.
+					char const *sep = SEP_END;
+					while( sep == SEP_END )
+					{
+						++dst_ptr;
+						sep = SEP_START;
+						for( ; sep != SEP_END && *dst_ptr && *dst_ptr != *sep; ++sep ){}
+					}
+					++dst_ptr;
+					continue;
+				}
+				char const * src_ptr = last_line.data() + src.start();
+				size_t len = src.end() - src.start();
+				if( len > buf_sz - static_cast<size_t>( dst_ptr - buf ) )
+				{
+					fetch = true;
+					break;
+				}
+				// A field differing from the cached one starts a new output entry.
+				if( 0 != memcmp( dst_ptr, src_ptr, len ) )
+				{
+					fetch = true;
+					break;
+				}
+				// The match only counts when the field stops here, on a separator.
+				char last = dst_ptr[len];
+				char const * sep_ = SEP_START;
+				assert( sep_ != nullptr );
+				while( 0 != *sep_ && *sep_ != last && *SEP_ENTRY != last )
+				{
+					++sep_;
+				}
+				if( 0 == *sep_ )
+				{
+					fetch = true;
+					break;
+				}
+				dst_ptr += len;
+				assert( dst_ptr >= buf );
+			}
+			// Close the previous entry, or glue this line to it with a separator.
+			fputc( fetch ? *SEP_ENTRY : * SEP_START, stdout );
+		}
+
+		if( str_sz > 0 && buf[str_sz - 1] == *SEP_ENTRY )
+		{
+			buf[--str_sz] = 0; // the entry separator becomes the terminating NUL
+		}
+		last_line.assign( buf, buf + str_sz + 1 ); // always keep the NUL for fputs()
+
+		if( fetch )
+		{
+			line_cache::iterator start     = last_line.begin();
+			line_cache::iterator cache_end = last_line.end();
+			size_t field_index = 0;
+			while( start != cache_end && field_index < field_cache.size() )
+			{
+				auto end = last_line.end();
+				if( last_line.back() == 0 )
+				{
+					--end;
+				}
+				line_cache::iterator it = std::find_first_of
+					(
+					 start, end,
+					 SEP_START, SEP_END
+					);
+				if( !field_cache[field_index].ignore() )
+				{
+					field_cache[field_index].define(
+							static_cast<uint16_t>( start - last_line.begin() ),
+							static_cast<uint16_t>( it - last_line.begin() )
+							);
+				}
+				start = it + 1;
+				++field_index;
+			}
+			fetch = false;
+		}
+		fputs( last_line.data(), stdout );
+	}
+	fputc( *SEP_ENTRY, stdout );
+	return EXIT_SUCCESS;
+}
+
+bool allocate_markers( // parses FIELDS into field_cache; returns true on error
+		char const * const FIELDS,
+		field_marker_t& field_cache
+)
+{
+	field_cache.reserve( UINT8_MAX ); //255 fields should fit most cases
+	size_t last_field = 0; // index being accumulated; 0 also means "nothing to store"
+	// FIELDS must be a NUL-terminated list of decimal indexes separated by commas.
+	// Index N is stored in slot N - 1, which is resized into existence if needed;
+	// slots that are never listed stay unset and are reported by ignore().
+	char const * fields = FIELDS;
+	do // the terminating NUL is processed too, so that the last index is committed
+	{
+		if( isdigit( static_cast<unsigned char>( *fields ) ) )
+		{
+			last_field = last_field * 10 + static_cast<size_t>( *fields - '0' );
+		}
+		else if( *fields != ',' && *fields != 0 )
+		{
+			fputs( "ERROR: FIELDS contains illegal characters\n", stderr );
+			return true;
+		}
+		else if( last_field != 0 ) // empty entries ("1,,3") and 0 have no slot
+		{
+			field_cache.resize( std::max( field_cache.size(), last_field ) );
+			field_cache[last_field - 1].define( 0, 0 ); // marks the slot as compared
+			last_field = 0;
+		}
+	}while( *fields++ );
+	field_cache.shrink_to_fit();
+	return false;
+}
diff --git a/merge.1.md b/merge.1.md
@@ -1,68 +0,0 @@
-% merge(1) merge manpage
-% Bérenger Morel
-% 2020-02-25
-
-# NAME
-
-*merge* - merges entries with common fields
-
-# SYNOPSIS
-
-`cat foo | *merge*`
-
-# DESCRIPTION
-
-Merges consecutive entries when they share a common field.
-Entries are read from stdin.
-
-*merge* does not remove duplicated fields.
-
-# OPTIONS
-
-For now, options are managed through environment variables:
-
-FIELDS
-
-: list of indexes (starting from 0, separated with **commas** (\',\', 0x2C) of
-the fields that will be compared.
-
-FIELD_SEP
-
-: list of characters that will be considered as field separators.
-Defaults to **space** (\' \', 0x20) and **horizontal tabulation** (\'\\t\', 0x09).
-
-ENTRY_SEP
-
-: list of characters that will be considered as entry separators.
-Defaults to **newline** (\'\\n\', 0x0A).
-
-# EXAMPLE
-
-This invocation:
-
-```sh
-FIELD_SEP=": \t" FIELDS="1,3" ./merge <<EOF
-0 foo:hello:1
-1 bar:hello:2
-	2:foo:world:3
-	2:bar:world:4
-EOF      
-```
-
-will generate this result:
-
-```
-0 foo:hello:1
-1 bar:hello:2
-	2:foo:world:3:	2:bar:world:4
-```
-
-# BUGS
-
-Multi-byte characters can not be used for FIELD_SEP and ENTRY_SEP.
-
-# TODO
-
-Allow the use of command-line parameters to configure behavior.
-
-# SEE ALSO
diff --git a/merge.cpp b/merge.cpp
@@ -1,287 +0,0 @@
-#ifdef LIBCPP_MUSL_STATIC
-#define __GLIBC_PREREQ(x,y) 0
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <assert.h>
-#include <stdint.h>
-#include <ctype.h>
-
-#include <algorithm>
-#include <vector>
-
-/**
- * This program reads stdin and when consecutive lines have specific fields all
- * containing the same value, prints them replacing the newline character by
- * the 1st character in FIELD_SEP.
- * Fields are delimited by the FIELD_SEP environment variable. If not defined,
- * " \\t" is used instead (see isblank(3)).
- * Fields to use are defined by the environment variable FIELDS, which only
- * use unsigned decimal integers separated by commas, other characters makes the
- * value invalid.
- * If FIELDS is not defined or invalid, exits with an error.
- * Empty field indexes ("1,,3") are ignored (will resolve in "1,3").
- * Do not work if input is not in line mode.
- * Line separator is defined by ENTRY_SEP, or "\\n" if not defined.
- *
- * TODO:
- * * check that ENTRY_SEP works as expected;
- * * fix the fact input needs a "\\n" at end of last line for it to be merged;
- * * UTF-8 support (field separators);
- * * providing FIELDS variable as command-line option;
- * * -v/--version option;
- * * -h/--help    option;
- * * remove bloated STL containers;
- * * allow to customize the memory allocation scheme at runtime;
- * * allow to not print twice merged fields;
- * * allow to set verbosity on stderr;
- * * remove hard-coded limit of UINT16_MAX - 1 for fields start/stop positions;
- * * print as many lines as there where duplicates?
- *
- * Coding rules:
- * * const affect what is before it, so it must follow the type;
- **/
-
-class field_marker;
-typedef std::vector<char> line_cache;
-typedef std::vector<field_marker> field_marker_t;
-
-class field_marker
-{
-	uint16_t m_start = UINT16_MAX, m_end = UINT16_MAX;
-
-public:
-	bool ignore( void ) const
-	{
-		return m_start == m_end && m_start == UINT16_MAX;
-	}
-
-	void define( uint16_t start, uint16_t end )
-	{
-		assert( end >= start && start != UINT8_MAX && end != UINT8_MAX );
-		m_start = start; m_end = end;
-	}
-
-	uint16_t start( void ) const
-	{
-		assert( !ignore() );
-		return m_start;
-	}
-
-	uint16_t end( void ) const
-	{
-		assert( !ignore() );
-		return m_end;
-	}
-};
-
-bool allocate_markers(
-		char const * const FIELDS,
-		field_marker_t& field_cache
-);
-
-int main( void )
-{
-	char const * const DEFAULT_FIELD_SEP = " \t";
-	char const * const DEFAULT_ENTRY_SEP = "\n";
-
-	char const * SEP_START = getenv( "FIELD_SEP" );
-	if( !SEP_START )
-	{
-		SEP_START = DEFAULT_FIELD_SEP;
-	}
-	char const * SEP_ENTRY = getenv( "ENTRY_SEP" );
-	if( !SEP_ENTRY )
-	{
-		SEP_ENTRY = DEFAULT_ENTRY_SEP;
-	}
-
-	char const * const FIELDS = getenv( "FIELDS" );
-	if( !FIELDS )
-	{
-		fputs( "ERROR: FIELDS is not defined\n", stderr );
-		return EXIT_FAILURE;
-	}
-
-	if( strlen( FIELDS ) == 0 )
-	{
-		fputs( "ERROR: FIELDS is empty\n", stderr );
-		return EXIT_FAILURE;
-	}
-
-	field_marker_t field_cache;
-	if( allocate_markers( FIELDS, field_cache ) )
-	{
-		return EXIT_FAILURE;
-	}
-
-	size_t buf_sz = 2048;
-	char* buf = nullptr;
-
-	// allocating a cache of at least 16 bytes.
-	// Note: I don't see how merging lines smaller than 16 bytes can be useful
-	// also, not even enough mem for that would indicate bigger problems...
-	while( !buf && buf_sz >= 32 )
-	{
-		buf_sz /= 2;
-		buf = static_cast<char*>( malloc( buf_sz ) );
-	}
-
-	if( !buf )
-	{
-		fprintf( stderr, "ERROR: malloc %s(%d)\n", strerror( errno ), errno );
-		return EXIT_FAILURE;
-	}
-
-	bool fetch = true;
-	line_cache last_line;
-	char const * const SEP_END = SEP_START + strlen( SEP_START );
-	while( !feof( stdin ) )
-	{
-		if( !fgets( buf, static_cast<int>( buf_sz ), stdin ) )
-		{
-			free( buf );
-			buf = nullptr;
-			buf_sz = 0;
-			if( !feof( stdin ) )
-			{
-				fprintf( stderr, "ERROR: fgets %s(%d)\n", strerror( errno ), errno );
-				return EXIT_FAILURE;
-			}
-			break;
-		}
-
-		size_t str_sz = strlen( buf );
-		if( str_sz == buf_sz - 1 && buf[str_sz] != '\n' && !feof( stdin ) )
-		{
-			fprintf( stderr, "ERROR: buffer too small for some lines\n" );
-			return EXIT_FAILURE;
-		}
-		
-		if( !fetch )
-		{
-			char const* dst_ptr = buf;
-			for( size_t i = 0; i < field_cache.size(); ++i )
-			{
-				field_marker const& src = field_cache[i];
-				if( src.ignore() )
-				{
-					char const *sep = SEP_END;
-					while( sep == SEP_END )
-					{
-						++dst_ptr;
-						sep = SEP_START;
-						for( ; sep != SEP_END && *dst_ptr && *dst_ptr != *sep; ++sep ){}
-					}
-					++dst_ptr;
-					continue;
-				}
-				char const * src_ptr = last_line.data() + src.start();
-				size_t len = src.end() - src.start();
-				if( len > buf_sz - static_cast<size_t>( dst_ptr - buf ) )
-				{
-					fetch = true;
-					break;
-				}
-
-				if( 0 != memcmp( dst_ptr, src_ptr, len ) )
-				{
-					fetch = true;
-					break;
-				}
-
-				char last = dst_ptr[len];
-				char const * sep_ = SEP_START;
-				assert( sep_ != nullptr );
-				while( 0 != *sep_ && *sep_ != last && *SEP_ENTRY != last )
-				{
-					++sep_;
-				}
-				if( 0 == *sep_ )
-				{
-					fetch = true;
-					break;
-				}
-				dst_ptr += len;
-				assert( dst_ptr >= buf );
-			}
-
-			fputc( fetch ? *SEP_ENTRY : * SEP_START, stdout );
-		}
-
-		last_line.assign( buf, buf + str_sz );
-		if( last_line.back() == *SEP_ENTRY )
-		{
-			last_line.back() = 0;
-		}
-
-		if( fetch )
-		{
-			line_cache::iterator start     = last_line.begin();
-			line_cache::iterator cache_end = last_line.end();
-			size_t field_index = 0;
-			while( start != cache_end && field_index < field_cache.size() )
-			{
-				auto end = last_line.end();
-				if( last_line.back() == 0 )
-				{
-					--end;
-				}
-				line_cache::iterator it = std::find_first_of
-					(
-					 start, end,
-					 SEP_START, SEP_END
-					);
-				if( !field_cache[field_index].ignore() )
-				{
-					field_cache[field_index].define(
-							static_cast<uint16_t>( start - last_line.begin() ),
-							static_cast<uint16_t>( it - last_line.begin() )
-							);
-				}
-				start = it + 1;
-				++field_index;
-			}
-			fetch = false;
-		}
-		fputs( last_line.data(), stdout );
-	}
-	fputc( *SEP_ENTRY, stdout );
-	return EXIT_SUCCESS;
-}
-
-bool allocate_markers(
-		char const * const FIELDS,
-		field_marker_t& field_cache
-)
-{
-	field_cache.reserve( UINT8_MAX ); //255 fields should fit most cases
-	size_t last_field = 0;
-
-	char const * fields = FIELDS - 1;
-	do
-	{
-		++fields;
-		if( isdigit( *fields ) )
-		{
-			last_field = last_field * 10 + static_cast<size_t>( *fields - '0' );
-		}
-		else if( *fields == ',' || *fields == 0 )
-		{
-			size_t max = std::max( field_cache.size(), last_field );
-			field_cache.resize( max );
-			field_cache[last_field - 1].define( 0, 0 );
-			last_field = 0;
-		}
-		else
-		{
-			fputs( "ERROR: FIELDS contains illegal characters\n", stderr );
-			return true;
-		}
-	}while( *fields );
-	field_cache.shrink_to_fit();
-	return false;
-}

	tools various tools
	git clone git://deadbeef.fr/tools.git
	Log \| Files \| Refs \| README \| LICENSE

M	Makefile	\|	8	++++----
M	README	\|	6	+++---
A	lmerge.1.md	\|	68	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lmerge.cpp	\|	289	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	merge.1.md	\|	68	--------------------------------------------------------------------
D	merge.cpp	\|	287	-------------------------------------------------------------------------------