Brand new .naturalSort() code: it was previously borrowed code that was slow and vulnerable to RegExp DoS; the new code is made by myself, has no RegExp, only matches forward, and is more flexible and easier to maintain (#3)
cronvel · Aug 17, 2021 · 9cac4c2 · 9cac4c2
1 parent 694f697
commit 9cac4c2
Show file tree

Hide file tree

Showing 5 changed files with 150 additions and 58 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,4 +1,10 @@
 
+v0.12.8
+-------
+
+Brand new .naturalSort() code: it was previously borrowed code that was slow and vulnerable to RegExp DoS; the new code is made by myself, has no RegExp, only matches forward, and is more flexible and easier to maintain (#3)
+
+
 v0.12.7
 -------
 

diff --git a/lib/naturalSort.js b/lib/naturalSort.js
@@ -28,57 +28,118 @@
 
 
 
-/*
- * Natural Sort algorithm for Javascript - Version 0.8 - Released under MIT license
- * Author: Jim Palmer (based on chunking idea from Dave Koelle)
- */
-module.exports = function( a , b ) {
-	var re = /(^([+-]?(?:\d*)(?:\.\d*)?(?:[eE][+-]?\d+)?)?$|^0x[\da-fA-F]+$|\d+)/g ,
-		sre = /^\s+|\s+$/g ,   // trim pre-post whitespace
-		snre = /\s+/g ,        // normalize all whitespace to single ' ' character
-		dre = /(^([\w ]+,?[\w ]+)?[\w ]+,?[\w ]+\d+:\d+(:\d+)?[\w ]?|^\d{1,4}[/-]\d{1,4}[/-]\d{1,4}|^\w+, \w+ \d+, \d{4})/ ,
-		hre = /^0x[0-9a-f]+$/i ,
-		ore = /^0/ ,
-		i = function( s ) {
-			return ( '' + s ).toLowerCase().replace( sre , '' ) ;
-		} ,
-		// convert all to strings strip whitespace
-		x = i( a ) || '' ,
-		y = i( b ) || '' ,
-		// chunk/tokenize
-		xN = x.replace( re , '\0$1\0' ).replace( /\0$/ , '' )
-			.replace( /^\0/ , '' )
-			.split( '\0' ) ,
-		yN = y.replace( re , '\0$1\0' ).replace( /\0$/ , '' )
-			.replace( /^\0/ , '' )
-			.split( '\0' ) ,
-		// numeric, hex or date detection
-		xD = parseInt( x.match( hre ) , 16 ) || ( xN.length !== 1 && Date.parse( x ) ) ,
-		yD = parseInt( y.match( hre ) , 16 ) || xD && y.match( dre ) && Date.parse( y ) || null ,
-		normChunk = function( s , l ) {
-			// normalize spaces; find floats not starting with '0', string or 0 if not defined (Clint Priest)
-			return ( ! s.match( ore ) || l === 1 ) && parseFloat( s ) || s.replace( snre , ' ' ).replace( sre , '' ) || 0 ;	// jshint ignore:line
-		} ,
-		oFxNcL , oFyNcL ;
-	// first try and sort Hex codes or Dates
-	if ( yD ) {
-		if ( xD < yD ) { return -1 ; }
-		else if ( xD > yD ) { return 1 ; }
+const CONTROL_CLASS = 1 ;	// code <= 0x1f or 0x7f, except tab which is a word separator. naturalSort() returns aClass - bClass, so these values also rank the classes
+const WORD_SEPARATOR_CLASS = 2 ;	// space, tab, no-break space, hyphen, underscore
+const LETTER_CLASS = 3 ;	// any char whose upper-case and lower-case forms differ
+const NUMBER_CLASS = 4 ;	// ASCII digits 0x30-0x39
+const SYMBOL_CLASS = 5 ;	// everything that matched none of the above
+
+
+
+function getCharacterClass( char , code ) {	// Returns the *_CLASS constant for one char; naturalSort() passes `code` as that char's charCodeAt() value
+	if ( isWordSeparator( code ) ) { return WORD_SEPARATOR_CLASS ; }	// Tested before the control range, so tab (0x09) is a separator, not a control char
+	if ( code <= 0x1f || code === 0x7f ) { return CONTROL_CLASS ; }
+	if ( isNumber( code ) ) { return NUMBER_CLASS ; }
+	// Here we assume that a letter is a char with a “case”
+	if ( char.toUpperCase() !== char.toLowerCase() ) { return LETTER_CLASS ; }
+	return SYMBOL_CLASS ;	// Fallback: any remaining char with no upper/lower case distinction
+}
+
+
+
+function isWordSeparator( code ) {	// True when the char code is one of the five separators listed below; anything else, NaN included, yields false
+	if (
+		// space, tab, no-break space
+		code === 0x20 || code === 0x09 || code === 0xa0 ||
+		// hyphen, underscore
+		code === 0x2d || code === 0x5f
+	) {
+		return true ;
 	}
-	// natural sorting through split numeric strings and default strings
-	for( var cLoc = 0 , xNl = xN.length , yNl = yN.length , numS = Math.max( xNl , yNl ) ; cLoc < numS ; cLoc ++ ) {
-		oFxNcL = normChunk( xN[cLoc] , xNl ) ;
-		oFyNcL = normChunk( yN[cLoc] , yNl ) ;
-		// handle numeric vs string comparison - number < string - (Kyle Adams)
-		if ( isNaN( oFxNcL ) !== isNaN( oFyNcL ) ) { return ( isNaN( oFxNcL ) ) ? 1 : -1 ; }
-		// rely on string comparison if different types - i.e. '02' < 2 != '02' < '2'
-		else if ( typeof oFxNcL !== typeof oFyNcL ) {
-			oFxNcL += '' ;
-			oFyNcL += '' ;
+
+	return false ;
+}
+
+
+
+function isNumber( code ) {	// True only for the ASCII digits '0' (0x30) to '9' (0x39); sign, decimal point and exponent are not part of a number here
+	if ( code >= 0x30 && code <= 0x39 ) { return true ; }
+	return false ;	// Also reached for NaN, which is what charCodeAt() returns past the end of a string
+}
+
+
+
+function naturalSort( a , b ) {
+	a = '' + a ;
+	b = '' + b ;
+
+	var aIndex , aEndIndex , aChar , aCode , aClass , aCharLc , aNumber ,
+		aTrim = a.trim() ,
+		aLength = aTrim.length ,
+		bIndex , bEndIndex , bChar , bCode , bClass , bCharLc , bNumber ,
+		bTrim = b.trim() ,
+		bLength = bTrim.length ,
+		advantage = 0 ;
+
+	for ( aIndex = bIndex = 0 ; aIndex < aLength && bIndex < bLength ; aIndex ++ , bIndex ++ ) {
+		aChar = aTrim[ aIndex ] ;
+		bChar = bTrim[ bIndex ] ;
+		aCode = aTrim.charCodeAt( aIndex ) ;
+		bCode = bTrim.charCodeAt( bIndex ) ;
+		aClass = getCharacterClass( aChar , aCode ) ;
+		bClass = getCharacterClass( bChar , bCode ) ;
+		if ( aClass !== bClass ) { return aClass - bClass ; }
+
+		switch ( aClass ) {
+			case WORD_SEPARATOR_CLASS :
+				// Eat all white chars and continue
+				while ( isWordSeparator( aTrim.charCodeAt( aIndex + 1 ) ) ) { aIndex ++ ; }
+				while ( isWordSeparator( bTrim.charCodeAt( bIndex + 1 ) ) ) { bIndex ++ ; }
+				break ;
+
+			case CONTROL_CLASS :
+			case SYMBOL_CLASS :
+				if ( aCode !== bCode ) { return aCode - bCode ; }
+				break ;
+
+			case LETTER_CLASS :
+				aCharLc = aChar.toLowerCase() ;
+				bCharLc = bChar.toLowerCase() ;
+				if ( aCharLc !== bCharLc ) { return aCharLc > bCharLc ? 1 : -1 ; }
+
+				// As a last resort, we would sort uppercase first
+				if ( ! advantage && aChar !== bChar ) { advantage = aChar !== aCharLc ? -1 : 1 ; }
+
+				break ;
+
+			case NUMBER_CLASS :
+				// Lookup for a whole number and parse it
+				aEndIndex = aIndex + 1 ;
+				while ( isNumber( aTrim.charCodeAt( aEndIndex ) ) ) { aEndIndex ++ ; }
+				aNumber = parseFloat( aTrim.slice( aIndex , aEndIndex ) ) ;
+
+				bEndIndex = bIndex + 1 ;
+				while ( isNumber( bTrim.charCodeAt( bEndIndex ) ) ) { bEndIndex ++ ; }
+				bNumber = parseFloat( bTrim.slice( bIndex , bEndIndex ) ) ;
+
+				if ( aNumber !== bNumber ) { return aNumber - bNumber ; }
+
+				// As a last resort, we would sort the number with the less char first
+				if ( ! advantage && aEndIndex - aIndex !== bEndIndex - bIndex ) { advantage = ( aEndIndex - aIndex ) - ( bEndIndex - bIndex ) ; }
+
+				// Advance the index at the end of the number area
+				aIndex = aEndIndex - 1 ;
+				bIndex = bEndIndex - 1 ;
+				break ;
 		}
-		if ( oFxNcL < oFyNcL ) { return -1 ; }
-		if ( oFxNcL > oFyNcL ) { return 1 ; }
 	}
-	return 0 ;
-} ;
+
+	// If there was an “advantage”, use it now
+	if ( advantage ) { return advantage ; }
+
+	// Finally, sort by remaining char, or by trimmed length or by full length
+	return ( aLength - aIndex ) - ( bLength - bIndex ) || aLength - bLength || a.length - b.length ;
+}
+
+module.exports = naturalSort ;
 
diff --git a/lib/string.js b/lib/string.js
@@ -28,7 +28,7 @@
 
 
 
-var stringKit = {} ;
+const stringKit = {} ;
 module.exports = stringKit ;
 
 

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "string-kit",
-  "version": "0.12.7",
+  "version": "0.12.8",
   "engines": {
     "node": ">=6.0.0"
   },

diff --git a/test/string-test.js b/test/string-test.js
@@ -1063,6 +1063,37 @@ describe( "Fuzzy string matching" , () => {
 
 
 
+describe( "Natural sort" , () => {	// Each case sorts an array with string.naturalSort as the Array#sort() comparator and checks the resulting order
+
+	it( "basic natural sort tests" , () => {
+		expect( [ 'one' , 'two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'three' , 'two' ] ) ;
+
+		// Case insensitive
+		expect( [ 'one' , 'two' , 'Three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'Three' , 'two' ] ) ;
+		expect( [ 'One' , 'Two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'One' , 'three' , 'Two' ] ) ;
+
+		// Uppercase first as a tie-breaker
+		expect( [ 'one' , 'One' , 'two' , 'Two' , 'Three' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'One' , 'one' , 'Three' , 'three' , 'Two' , 'two' ] ) ;
+
+		// Lesser number first
+		expect( [ 'abc121' , 'abc17' , 'abc12' , 'abc134' ].sort( string.naturalSort ) ).to.equal( [ 'abc12' , 'abc17' , 'abc121' , 'abc134' ] ) ;
+
+		// White space / word separator insensitive
+		expect( [ '  One  ' , '   Two   ' , 'three' ].sort( string.naturalSort ) ).to.equal( [ '  One  ' , 'three' , '   Two   ' ] ) ;
+		expect( [ 'abc   121' , 'abc 17' , 'abc  12' , 'abc    134' ].sort( string.naturalSort ) ).to.equal( [ 'abc  12' , 'abc 17' , 'abc   121' , 'abc    134' ] ) ;
+		expect( [ 'a-123-a' , 'a_12_a' , 'a 18 a' ].sort( string.naturalSort ) ).to.equal( [ 'a_12_a' , 'a 18 a' , 'a-123-a' ] ) ;	// hyphen, underscore and space all count as the same separator
+		expect( [ 'a_123_a' , 'a-12-a' , 'a 18 a' ].sort( string.naturalSort ) ).to.equal( [ 'a-12-a' , 'a 18 a' , 'a_123_a' ] ) ;
+
+		// Number with shorter char-width as a tie-breaker
+		expect( [ 'abc00012' , 'abc012' , 'abc017' , 'abc12' , 'abc134' ].sort( string.naturalSort ) ).to.equal( [ 'abc12' , 'abc012' , 'abc00012' , 'abc017' , 'abc134' ] ) ;
+
+		// Symbols
+		expect( [ ';+$' , '!:;,' , '“”' ].sort( string.naturalSort ) ).to.equal( [ "!:;," , ";+$" , "“”" ] ) ;	// symbols are ordered by char code
+	} ) ;
+} ) ;
+
+
+
 describe( "Misc" , () => {
 
 	it( ".resize()" , () => {
@@ -1071,12 +1102,6 @@ describe( "Misc" , () => {
 		expect( string.resize( 'bobby' , 8 ) ).to.be( 'bobby   ' ) ;
 	} ) ;
 
-	it( ".naturalSort()" , () => {
-		expect( [ 'one' , 'two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'three' , 'two' ] ) ;
-		expect( [ 'one' , 'two' , 'Three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'Three' , 'two' ] ) ;
-		expect( [ 'One' , 'Two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'One' , 'three' , 'Two' ] ) ;
-	} ) ;
-
 	it( ".occurrenceCount()" , () => {
 		expect( string.occurrenceCount( '' , '' ) ).to.be( 0 ) ;
 		expect( string.occurrenceCount( 'three' , '' ) ).to.be( 0 ) ;