Skip to content

Commit

Permalink
fix: normalize-url
Browse files Browse the repository at this point in the history
  • Loading branch information
pgagnidze committed Nov 17, 2023
1 parent c67dc7d commit b9ebf0f
Show file tree
Hide file tree
Showing 5 changed files with 289 additions and 8 deletions.
1 change: 0 additions & 1 deletion package.json
Expand Up @@ -85,7 +85,6 @@
"dependencies": {
"@aws-sdk/client-s3": "^3.451.0",
"@aws-sdk/lib-storage": "^3.451.0",
"normalize-url": "^8.0.0",
"super-regex": "^0.2.0",
"url-regex-safe": "^4.0.0",
"ajv": "^8.11.2",
Expand Down
3 changes: 2 additions & 1 deletion src/utils/geturls.util.ts
@@ -1,5 +1,6 @@
// https://github.com/sindresorhus/get-urls
import urlRegex from "url-regex-safe";
import normalizeUrl from "normalize-url";
import { normalizeUrl } from '../utils';
import { isMatch, matches } from "super-regex";

const getUrlsFromQueryParameters = (url: string) => {
Expand Down
3 changes: 2 additions & 1 deletion src/utils/index.ts
Expand Up @@ -9,4 +9,5 @@ export * from './error.util';
export * from './crypto.util';
export * from './other.util';
export * from './cli.util';
export * from './geturls.util'
export * from './geturls.util'
export * from './normalizeurl.util'
285 changes: 285 additions & 0 deletions src/utils/normalizeurl.util.ts
@@ -0,0 +1,285 @@
// https://github.com/sindresorhus/normalize-url
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
// MIME type treated as the implicit default of a data URL: the data-URL
// normalizer omits it from its output when no other media-type part remains.
const DATA_URL_DEFAULT_MIME_TYPE = 'text/plain';
// Charset treated as the implicit default of a data URL: a `charset` attribute
// whose (lowercased) value equals this is dropped by the data-URL normalizer.
const DATA_URL_DEFAULT_CHARSET = 'us-ascii';

/**
 * Reports whether `name` is accepted by at least one entry of `filters`.
 *
 * A `RegExp` entry accepts the name when its `test` succeeds; every other
 * entry accepts the name only when it is strictly equal to it.
 */
const testParameter = (name: string | undefined, filters: any[]) => {
  const accepts = (filter: any): boolean => {
    if (filter instanceof RegExp) {
      return filter.test(name as string);
    }

    return filter === name;
  };

  return filters.some(accepts);
};

// Protocols (as reported by `URL.protocol`, trailing colon included) that are
// NOT considered custom.
const supportedProtocols = new Set(['https:', 'http:', 'file:']);

/**
 * Tells whether `urlString` parses as an absolute URL whose protocol is not
 * one of `supportedProtocols`.
 *
 * Returns `false` when the string cannot be parsed by the `URL` constructor.
 */
const hasCustomProtocol = (urlString: string) => {
  let parsed: URL;

  try {
    parsed = new URL(urlString);
  } catch {
    // Not parseable as an absolute URL, so there is no protocol to inspect.
    return false;
  }

  const scheme = parsed.protocol;

  if (!scheme.endsWith(':')) {
    return false;
  }

  return !supportedProtocols.has(scheme);
};

/**
 * Normalizes a `data:` URL.
 *
 * The media type is split on `;`. A trailing `base64` token is recognised and
 * re-appended last, the MIME type is lowercased, attribute keys and values
 * are trimmed, and the `charset` value is lowercased and dropped when it
 * equals `DATA_URL_DEFAULT_CHARSET`. The MIME type is omitted when it equals
 * `DATA_URL_DEFAULT_MIME_TYPE` and no other media-type part remains. For
 * base64 URLs the payload is trimmed.
 *
 * @param urlString - The data URL; its scheme is matched case-insensitively.
 * @param options - Only `stripHash` is read: when truthy the `#hash` is dropped.
 * @returns The normalized data URL, always starting with lowercase `data:`.
 * @throws Error with message `Invalid URL: <input>` when the string does not
 * have the `data:<type>,<data>[#hash]` shape.
 */
const normalizeDataURL = (urlString: string, {stripHash}: any) => {
  // The `i` flag only affects the literal `data` scheme (the rest of the
  // pattern contains no letters). Callers detect data URLs case-insensitively,
  // so `DATA:...` must be accepted here as well instead of being rejected.
  const match = /^data:(?<type>[^,]*?),(?<data>[^#]*?)(?:#(?<hash>.*))?$/i.exec(urlString);

  if (!match) {
    throw new Error(`Invalid URL: ${urlString}`);
  }

  // `type` and `data` always participate in a successful match; `hash` is optional.
  const {type = '', data = '', hash: rawHash = ''} = match.groups ?? {};
  const hash = stripHash ? '' : rawHash;
  const mediaType = type.split(';');

  let isBase64 = false;
  if (mediaType[mediaType.length - 1] === 'base64') {
    mediaType.pop();
    isBase64 = true;
  }

  // Lowercase MIME type
  const mimeType = mediaType.shift()?.toLowerCase() ?? '';
  const attributes = mediaType
    .map(attribute => {
      let [key, value = ''] = attribute.split('=').map(part => part.trim());

      // Lowercase `charset` and drop it entirely when it is the default.
      if (key === 'charset') {
        value = value.toLowerCase();

        if (value === DATA_URL_DEFAULT_CHARSET) {
          return '';
        }
      }

      return `${key}${value ? `=${value}` : ''}`;
    })
    .filter(Boolean);

  const normalizedMediaType = [...attributes];

  if (isBase64) {
    normalizedMediaType.push('base64');
  }

  // Keep the MIME type when something follows it, or when it is not the default.
  if (normalizedMediaType.length > 0 || (mimeType && mimeType !== DATA_URL_DEFAULT_MIME_TYPE)) {
    normalizedMediaType.unshift(mimeType);
  }

  return `data:${normalizedMediaType.join(';')},${isBase64 ? data.trim() : data}${hash ? `#${hash}` : ''}`;
};

/**
 * Normalizes a URL string.
 *
 * `data:` URLs are delegated to `normalizeDataURL`; URLs for which
 * `hasCustomProtocol` returns true are returned as-is (after trimming).
 * Everything else is parsed with the `URL` constructor and rewritten
 * according to `options`.
 *
 * @param urlString - The URL to normalize. Surrounding whitespace is trimmed.
 * @param options - Optional overrides for the defaults listed at the top of
 * the function body. `keepQueryParameters` and `stripProtocol` are also read
 * but have no default (i.e. they are off unless provided).
 * @returns The normalized URL string.
 * @throws Error when both `forceHttp` and `forceHttps` are set, and whatever
 * the `URL` constructor throws when the (protocol-prefixed) input cannot be
 * parsed.
 */
export function normalizeUrl(urlString: string, options: any = {}): string {
  options = {
    defaultProtocol: 'http',
    normalizeProtocol: true,
    forceHttp: false,
    forceHttps: false,
    stripAuthentication: true,
    stripHash: false,
    stripTextFragment: true,
    stripWWW: true,
    removeQueryParameters: [/^utm_\w+/i],
    removeTrailingSlash: true,
    removeSingleSlash: true,
    removeDirectoryIndex: false,
    removeExplicitPort: false,
    sortQueryParameters: true,
    ...options,
  };

  // Legacy: Append `:` to the protocol if missing.
  if (typeof options.defaultProtocol === 'string' && !options.defaultProtocol.endsWith(':')) {
    options.defaultProtocol = `${options.defaultProtocol}:`;
  }

  urlString = urlString.trim();

  // Data URL
  if (/^data:/i.test(urlString)) {
    return normalizeDataURL(urlString, options);
  }

  if (hasCustomProtocol(urlString)) {
    return urlString;
  }

  const hasRelativeProtocol = urlString.startsWith('//');
  const isRelativeUrl = !hasRelativeProtocol && /^\.*\//.test(urlString);

  // Prepend protocol: either in front of a protocol-less string, or in place
  // of a leading `//`.
  if (!isRelativeUrl) {
    urlString = urlString.replace(/^(?!(?:\w+:)?\/\/)|^\/\//, options.defaultProtocol);
  }

  const urlObject = new URL(urlString);

  if (options.forceHttp && options.forceHttps) {
    throw new Error('The `forceHttp` and `forceHttps` options cannot be used together');
  }

  if (options.forceHttp && urlObject.protocol === 'https:') {
    urlObject.protocol = 'http:';
  }

  if (options.forceHttps && urlObject.protocol === 'http:') {
    urlObject.protocol = 'https:';
  }

  // Remove auth
  if (options.stripAuthentication) {
    urlObject.username = '';
    urlObject.password = '';
  }

  // Remove hash, or only the text-fragment directive (`:~:text...`) inside it
  if (options.stripHash) {
    urlObject.hash = '';
  } else if (options.stripTextFragment) {
    urlObject.hash = urlObject.hash.replace(/#?:~:text.*?$/i, '');
  }

  // Remove duplicate slashes if not preceded by a protocol
  // NOTE: This could be implemented using a single negative lookbehind
  // regex, but we avoid that to maintain compatibility with older js engines
  // which do not have support for that feature.
  if (urlObject.pathname) {
    // TODO: Replace everything below with `urlObject.pathname = urlObject.pathname.replace(/(?<!\b[a-z][a-z\d+\-.]{1,50}:)\/{2,}/g, '/');` when Safari supports negative lookbehind.

    // Split the string by occurrences of this protocol regex, and perform
    // duplicate-slash replacement on the strings between those occurrences
    // (if any).
    const protocolRegex = /\b[a-z][a-z\d+\-.]{1,50}:\/\//g;

    let lastIndex = 0;
    let result = '';
    for (;;) {
      const match = protocolRegex.exec(urlObject.pathname);
      if (!match) {
        break;
      }

      const protocol = match[0];
      const protocolAtIndex = match.index;
      const intermediate = urlObject.pathname.slice(lastIndex, protocolAtIndex);

      result += intermediate.replace(/\/{2,}/g, '/');
      result += protocol;
      lastIndex = protocolAtIndex + protocol!.length;
    }

    const remnant = urlObject.pathname.slice(lastIndex, urlObject.pathname.length);
    result += remnant.replace(/\/{2,}/g, '/');

    urlObject.pathname = result;
  }

  // Decode URI octets
  if (urlObject.pathname) {
    try {
      urlObject.pathname = decodeURI(urlObject.pathname);
    } catch {
      // Best effort: a malformed escape sequence leaves the pathname as it is.
    }
  }

  // Remove directory index
  if (options.removeDirectoryIndex === true) {
    options.removeDirectoryIndex = [/^index\.[a-z]+$/];
  }

  if (Array.isArray(options.removeDirectoryIndex) && options.removeDirectoryIndex.length > 0) {
    let pathComponents = urlObject.pathname.split('/');
    const lastComponent = pathComponents[pathComponents.length - 1];

    if (testParameter(lastComponent, options.removeDirectoryIndex)) {
      pathComponents = pathComponents.slice(0, -1);
      urlObject.pathname = pathComponents.slice(1).join('/') + '/';
    }
  }

  if (urlObject.hostname) {
    // Remove trailing dot
    urlObject.hostname = urlObject.hostname.replace(/\.$/, '');

    // Remove `www.`
    if (options.stripWWW && /^www\.(?!www\.)[a-z\-\d]{1,63}\.[a-z.\-\d]{2,63}$/.test(urlObject.hostname)) {
      // Each label should be max 63 at length (min: 1).
      // Source: https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_host_names
      // Each TLD should be up to 63 characters long (min: 2).
      // It is technically possible to have a single character TLD, but none currently exist.
      urlObject.hostname = urlObject.hostname.replace(/^www\./, '');
    }
  }

  // Remove query unwanted parameters
  if (Array.isArray(options.removeQueryParameters)) {
    // eslint-disable-next-line unicorn/no-useless-spread -- We are intentionally spreading to get a copy.
    for (const key of [...urlObject.searchParams.keys()]) {
      if (testParameter(key, options.removeQueryParameters)) {
        urlObject.searchParams.delete(key);
      }
    }
  }

  // `removeQueryParameters: true` drops the whole query, unless an allow-list is given
  if (!Array.isArray(options.keepQueryParameters) && options.removeQueryParameters === true) {
    urlObject.search = '';
  }

  // Keep wanted query parameters
  if (Array.isArray(options.keepQueryParameters) && options.keepQueryParameters.length > 0) {
    // eslint-disable-next-line unicorn/no-useless-spread -- We are intentionally spreading to get a copy.
    for (const key of [...urlObject.searchParams.keys()]) {
      if (!testParameter(key, options.keepQueryParameters)) {
        urlObject.searchParams.delete(key);
      }
    }
  }

  // Sort query parameters
  if (options.sortQueryParameters) {
    urlObject.searchParams.sort();

    // Calling `.sort()` encodes the search parameters, so we need to decode them again.
    try {
      urlObject.search = decodeURIComponent(urlObject.search);
    } catch {
      // Best effort: keep the encoded query when it cannot be decoded.
    }
  }

  if (options.removeTrailingSlash) {
    urlObject.pathname = urlObject.pathname.replace(/\/$/, '');
  }

  // Remove an explicit port number, excluding a default port number, if applicable
  if (options.removeExplicitPort && urlObject.port) {
    urlObject.port = '';
  }

  const oldUrlString = urlString;

  // Take advantage of many of the Node `url` normalizations
  urlString = urlObject.toString();

  if (!options.removeSingleSlash && urlObject.pathname === '/' && !oldUrlString.endsWith('/') && urlObject.hash === '') {
    urlString = urlString.replace(/\/$/, '');
  }

  // Remove ending `/` unless removeSingleSlash is false
  if ((options.removeTrailingSlash || urlObject.pathname === '/') && urlObject.hash === '' && options.removeSingleSlash) {
    urlString = urlString.replace(/\/$/, '');
  }

  // Restore relative protocol, if applicable.
  // `http://` is always stripped (historic behaviour); in addition the protocol
  // that was actually prepended above is stripped, so that a non-default
  // `defaultProtocol` such as `https` is undone as well.
  if (hasRelativeProtocol && !options.normalizeProtocol) {
    const prependedProtocol = String(options.defaultProtocol).toLowerCase();
    const restorablePrefixes = ['http://', `${prependedProtocol}//`];
    const matchedPrefix = restorablePrefixes.find(prefix => urlString.startsWith(prefix));

    if (matchedPrefix !== undefined) {
      // Keep the trailing `//` of the prefix, drop only the `scheme:` part.
      urlString = urlString.slice(matchedPrefix.length - 2);
    }
  }

  // Remove http/https
  if (options.stripProtocol) {
    urlString = urlString.replace(/^(?:https?:)?\/\//, '');
  }

  return urlString;
}
5 changes: 0 additions & 5 deletions yarn.lock
Expand Up @@ -4202,11 +4202,6 @@ normalize-path@^3.0.0, normalize-path@~3.0.0:
resolved "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz"
integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==

normalize-url@^8.0.0:
version "8.0.0"
resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-8.0.0.tgz#593dbd284f743e8dcf6a5ddf8fadff149c82701a"
integrity sha512-uVFpKhj5MheNBJRTiMZ9pE/7hD1QTeEvugSJW/OmLzAp78PB5O6adfMNTvmfKhXBkvCzC+rqifWcVYpGFwTjnw==

npm-run-path@^2.0.0:
version "2.0.2"
resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-2.0.2.tgz#35a9232dfa35d7067b4cb2ddf2357b1871536c5f"
Expand Down

0 comments on commit b9ebf0f

Please sign in to comment.