From 9cac4c298ee92c1695b0695951f1488884a7ca73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Ronvel?= <cedric.ronvel@gmail.com>
Date: Tue, 17 Aug 2021 15:34:26 +0200
Subject: [PATCH] Brand new .naturalSort() code, it was previously borrowed
 code that was slow and vulnerable to RegExp DoS, the new code is made by
 myself, has no RegExp, only matching forward, is more flexible and easiest to
 maintain (#3)

---
 CHANGELOG           |   6 ++
 lib/naturalSort.js  | 161 ++++++++++++++++++++++++++++++--------------
 lib/string.js       |   2 +-
 package.json        |   2 +-
 test/string-test.js |  37 ++++++++--
 5 files changed, 150 insertions(+), 58 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 1e2e3bf..dff4b19 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,10 @@
 
+v0.12.8
+-------
+
+Brand new .naturalSort() code, it was previously borrowed code that was slow and vulnerable to RegExp DoS, the new code is made by myself, has no RegExp, only matching forward, is more flexible and easiest to maintain (#3)
+
+
 v0.12.7
 -------
 
diff --git a/lib/naturalSort.js b/lib/naturalSort.js
index 7e2531c..adafe1d 100644
--- a/lib/naturalSort.js
+++ b/lib/naturalSort.js
@@ -28,57 +28,118 @@
 
 
 
-/*
- * Natural Sort algorithm for Javascript - Version 0.8 - Released under MIT license
- * Author: Jim Palmer (based on chunking idea from Dave Koelle)
- */
-module.exports = function( a , b ) {
-	var re = /(^([+-]?(?:\d*)(?:\.\d*)?(?:[eE][+-]?\d+)?)?$|^0x[\da-fA-F]+$|\d+)/g ,
-		sre = /^\s+|\s+$/g ,   // trim pre-post whitespace
-		snre = /\s+/g ,        // normalize all whitespace to single ' ' character
-		dre = /(^([\w ]+,?[\w ]+)?[\w ]+,?[\w ]+\d+:\d+(:\d+)?[\w ]?|^\d{1,4}[/-]\d{1,4}[/-]\d{1,4}|^\w+, \w+ \d+, \d{4})/ ,
-		hre = /^0x[0-9a-f]+$/i ,
-		ore = /^0/ ,
-		i = function( s ) {
-			return ( '' + s ).toLowerCase().replace( sre , '' ) ;
-		} ,
-		// convert all to strings strip whitespace
-		x = i( a ) || '' ,
-		y = i( b ) || '' ,
-		// chunk/tokenize
-		xN = x.replace( re , '\0$1\0' ).replace( /\0$/ , '' )
-			.replace( /^\0/ , '' )
-			.split( '\0' ) ,
-		yN = y.replace( re , '\0$1\0' ).replace( /\0$/ , '' )
-			.replace( /^\0/ , '' )
-			.split( '\0' ) ,
-		// numeric, hex or date detection
-		xD = parseInt( x.match( hre ) , 16 ) || ( xN.length !== 1 && Date.parse( x ) ) ,
-		yD = parseInt( y.match( hre ) , 16 ) || xD && y.match( dre ) && Date.parse( y ) || null ,
-		normChunk = function( s , l ) {
-			// normalize spaces; find floats not starting with '0', string or 0 if not defined (Clint Priest)
-			return ( ! s.match( ore ) || l === 1 ) && parseFloat( s ) || s.replace( snre , ' ' ).replace( sre , '' ) || 0 ;	// jshint ignore:line
-		} ,
-		oFxNcL , oFyNcL ;
-	// first try and sort Hex codes or Dates
-	if ( yD ) {
-		if ( xD < yD ) { return -1 ; }
-		else if ( xD > yD ) { return 1 ; }
+const CONTROL_CLASS = 1 ;
+const WORD_SEPARATOR_CLASS = 2 ;
+const LETTER_CLASS = 3 ;
+const NUMBER_CLASS = 4 ;
+const SYMBOL_CLASS = 5 ;
+
+
+
+function getCharacterClass( char , code ) {
+	if ( isWordSeparator( code ) ) { return WORD_SEPARATOR_CLASS ; }
+	if ( code <= 0x1f || code === 0x7f ) { return CONTROL_CLASS ; }
+	if ( isNumber( code ) ) { return NUMBER_CLASS ; }
+	// Here we assume that a letter is a char with a “case”
+	if ( char.toUpperCase() !== char.toLowerCase() ) { return LETTER_CLASS ; }
+	return SYMBOL_CLASS ;
+}
+
+
+
+function isWordSeparator( code ) {
+	if (
+		// space, tab, no-break space
+		code === 0x20 || code === 0x09 || code === 0xa0 ||
+		// hyphen, underscore
+		code === 0x2d || code === 0x5f
+	) {
+		return true ;
 	}
-	// natural sorting through split numeric strings and default strings
-	for( var cLoc = 0 , xNl = xN.length , yNl = yN.length , numS = Math.max( xNl , yNl ) ; cLoc < numS ; cLoc ++ ) {
-		oFxNcL = normChunk( xN[cLoc] , xNl ) ;
-		oFyNcL = normChunk( yN[cLoc] , yNl ) ;
-		// handle numeric vs string comparison - number < string - (Kyle Adams)
-		if ( isNaN( oFxNcL ) !== isNaN( oFyNcL ) ) { return ( isNaN( oFxNcL ) ) ? 1 : -1 ; }
-		// rely on string comparison if different types - i.e. '02' < 2 != '02' < '2'
-		else if ( typeof oFxNcL !== typeof oFyNcL ) {
-			oFxNcL += '' ;
-			oFyNcL += '' ;
+
+	return false ;
+}
+
+
+
+function isNumber( code ) {
+	if ( code >= 0x30 && code <= 0x39 ) { return true ; }
+	return false ;
+}
+
+
+
+function naturalSort( a , b ) {
+	a = '' + a ;
+	b = '' + b ;
+
+	var aIndex , aEndIndex , aChar , aCode , aClass , aCharLc , aNumber ,
+		aTrim = a.trim() ,
+		aLength = aTrim.length ,
+		bIndex , bEndIndex , bChar , bCode , bClass , bCharLc , bNumber ,
+		bTrim = b.trim() ,
+		bLength = bTrim.length ,
+		advantage = 0 ;
+
+	for ( aIndex = bIndex = 0 ; aIndex < aLength && bIndex < bLength ; aIndex ++ , bIndex ++ ) {
+		aChar = aTrim[ aIndex ] ;
+		bChar = bTrim[ bIndex ] ;
+		aCode = aTrim.charCodeAt( aIndex ) ;
+		bCode = bTrim.charCodeAt( bIndex ) ;
+		aClass = getCharacterClass( aChar , aCode ) ;
+		bClass = getCharacterClass( bChar , bCode ) ;
+		if ( aClass !== bClass ) { return aClass - bClass ; }
+
+		switch ( aClass ) {
+			case WORD_SEPARATOR_CLASS :
+				// Eat all white chars and continue
+				while ( isWordSeparator( aTrim.charCodeAt( aIndex + 1 ) ) ) { aIndex ++ ; }
+				while ( isWordSeparator( bTrim.charCodeAt( bIndex + 1 ) ) ) { bIndex ++ ; }
+				break ;
+
+			case CONTROL_CLASS :
+			case SYMBOL_CLASS :
+				if ( aCode !== bCode ) { return aCode - bCode ; }
+				break ;
+
+			case LETTER_CLASS :
+				aCharLc = aChar.toLowerCase() ;
+				bCharLc = bChar.toLowerCase() ;
+				if ( aCharLc !== bCharLc ) { return aCharLc > bCharLc ? 1 : -1 ; }
+
+				// As a last resort, we would sort uppercase first
+				if ( ! advantage && aChar !== bChar ) { advantage = aChar !== aCharLc ? -1 : 1 ; }
+
+				break ;
+
+			case NUMBER_CLASS :
+				// Lookup for a whole number and parse it
+				aEndIndex = aIndex + 1 ;
+				while ( isNumber( aTrim.charCodeAt( aEndIndex ) ) ) { aEndIndex ++ ; }
+				aNumber = parseFloat( aTrim.slice( aIndex , aEndIndex ) ) ;
+
+				bEndIndex = bIndex + 1 ;
+				while ( isNumber( bTrim.charCodeAt( bEndIndex ) ) ) { bEndIndex ++ ; }
+				bNumber = parseFloat( bTrim.slice( bIndex , bEndIndex ) ) ;
+
+				if ( aNumber !== bNumber ) { return aNumber - bNumber ; }
+
+				// As a last resort, we would sort the number with the less char first
+				if ( ! advantage && aEndIndex - aIndex !== bEndIndex - bIndex ) { advantage = ( aEndIndex - aIndex ) - ( bEndIndex - bIndex ) ; }
+
+				// Advance the index at the end of the number area
+				aIndex = aEndIndex - 1 ;
+				bIndex = bEndIndex - 1 ;
+				break ;
 		}
-		if ( oFxNcL < oFyNcL ) { return -1 ; }
-		if ( oFxNcL > oFyNcL ) { return 1 ; }
 	}
-	return 0 ;
-} ;
+
+	// If there was an “advantage”, use it now
+	if ( advantage ) { return advantage ; }
+
+	// Finally, sort by remaining char, or by trimmed length or by full length
+	return ( aLength - aIndex ) - ( bLength - bIndex ) || aLength - bLength || a.length - b.length ;
+}
+
+module.exports = naturalSort ;
 
diff --git a/lib/string.js b/lib/string.js
index 5d62f70..0475931 100644
--- a/lib/string.js
+++ b/lib/string.js
@@ -28,7 +28,7 @@
 
 
 
-var stringKit = {} ;
+const stringKit = {} ;
 module.exports = stringKit ;
 
 
diff --git a/package.json b/package.json
index 32eb67e..8a80b8a 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "string-kit",
-  "version": "0.12.7",
+  "version": "0.12.8",
   "engines": {
     "node": ">=6.0.0"
   },
diff --git a/test/string-test.js b/test/string-test.js
index 215851c..b784f10 100644
--- a/test/string-test.js
+++ b/test/string-test.js
@@ -1063,6 +1063,37 @@ describe( "Fuzzy string matching" , () => {
 
 
 
+describe( "Natural sort" , () => {
+
+	it( "basic natural sort tests" , () => {
+		expect( [ 'one' , 'two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'three' , 'two' ] ) ;
+		
+		// Case insensitive
+		expect( [ 'one' , 'two' , 'Three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'Three' , 'two' ] ) ;
+		expect( [ 'One' , 'Two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'One' , 'three' , 'Two' ] ) ;
+		
+		// Uppercase first as a tie-breaker
+		expect( [ 'one' , 'One' , 'two' , 'Two' , 'Three' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'One' , 'one' , 'Three' , 'three' , 'Two' , 'two' ] ) ;
+		
+		// Lesser number first
+		expect( [ 'abc121' , 'abc17' , 'abc12' , 'abc134' ].sort( string.naturalSort ) ).to.equal( [ 'abc12' , 'abc17' , 'abc121' , 'abc134' ] ) ;
+		
+		// White space / word separator insensitive
+		expect( [ '  One  ' , '   Two   ' , 'three' ].sort( string.naturalSort ) ).to.equal( [ '  One  ' , 'three' , '   Two   ' ] ) ;
+		expect( [ 'abc   121' , 'abc 17' , 'abc  12' , 'abc    134' ].sort( string.naturalSort ) ).to.equal( [ 'abc  12' , 'abc 17' , 'abc   121' , 'abc    134' ] ) ;
+		expect( [ 'a-123-a' , 'a_12_a' , 'a 18 a' ].sort( string.naturalSort ) ).to.equal( [ 'a_12_a' , 'a 18 a' , 'a-123-a' ] ) ;
+		expect( [ 'a_123_a' , 'a-12-a' , 'a 18 a' ].sort( string.naturalSort ) ).to.equal( [ 'a-12-a' , 'a 18 a' , 'a_123_a' ] ) ;
+		
+		// Number with shorter char-width as a tie-breaker
+		expect( [ 'abc00012' , 'abc012' , 'abc017' , 'abc12' , 'abc134' ].sort( string.naturalSort ) ).to.equal( [ 'abc12' , 'abc012' , 'abc00012' , 'abc017' , 'abc134' ] ) ;
+		
+		// Symbols
+		expect( [ ';+$' , '!:;,' , '“”' ].sort( string.naturalSort ) ).to.equal( [ "!:;," , ";+$" , "“”" ] ) ;
+	} ) ;
+} ) ;
+
+
+
 describe( "Misc" , () => {
 
 	it( ".resize()" , () => {
@@ -1071,12 +1102,6 @@ describe( "Misc" , () => {
 		expect( string.resize( 'bobby' , 8 ) ).to.be( 'bobby   ' ) ;
 	} ) ;
 
-	it( ".naturalSort()" , () => {
-		expect( [ 'one' , 'two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'three' , 'two' ] ) ;
-		expect( [ 'one' , 'two' , 'Three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'Three' , 'two' ] ) ;
-		expect( [ 'One' , 'Two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'One' , 'three' , 'Two' ] ) ;
-	} ) ;
-
 	it( ".occurrenceCount()" , () => {
 		expect( string.occurrenceCount( '' , '' ) ).to.be( 0 ) ;
 		expect( string.occurrenceCount( 'three' , '' ) ).to.be( 0 ) ;