-
Notifications
You must be signed in to change notification settings - Fork 0
/
uncdn.py
executable file
·133 lines (103 loc) · 4.47 KB
/
uncdn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
import os
import re
import subprocess
from argparse import ArgumentParser
# Folder (relative to the project root) into which assets are downloaded.
OUTPUT_FOLDERNAME = 'external'
# File where `scrape` records the external URLs found, one per line.
OUTPUT_LINKS_FILENAME = 'links.txt'
# Directory names never scanned for URLs (VCS/editor/vendor dirs plus our own output).
DEFAULT_EXCLUDED_SCRAPE = ['.git', '.vscode', 'bower_components', 'node_modules', OUTPUT_FOLDERNAME]
# `internalize` additionally must not rewrite the links file itself.
DEFAULT_EXCLUDED_INTERNALIZE = DEFAULT_EXCLUDED_SCRAPE + [OUTPUT_LINKS_FILENAME]
# Matches absolute (http/https) and protocol-relative (//) asset URLs
# ending in a known static-asset extension.
DEFAULT_EXTERNAL_ASSETS_REGEX = r'(http(s)?:)?//(\w|\.|/|:|\-|\d|=|\?)+\.(svg|png|jpg|jpeg|js|css|ico|gif)'
def get_file_list(directories_to_skip):
    """Recursively list every file under the current directory.

    Args:
        directories_to_skip: iterable of directory names to exclude. A
            directory is skipped when its name appears as a whole path
            component of the walked path.

    Returns:
        list of relative file paths (as produced by ``os.path.join``).
    """
    files_to_scan = []
    for dirpath, _, filenames in os.walk('.'):
        # Match whole path components, not substrings: the original
        # substring test made an exclusion like '.git' also skip '.github',
        # and 'external' skip any directory merely containing that text.
        components = dirpath.split(os.sep)
        if any(skippable in components for skippable in directories_to_skip):
            continue
        for filename in filenames:
            files_to_scan.append(os.path.join(dirpath, filename))
    return files_to_scan
def external_urls(filename, url_pattern):
    """Yield every external asset URL found in *filename*.

    Args:
        filename: path of the text file to scan.
        url_pattern: a compiled regex matching external asset URLs.

    Yields:
        Matched URLs, with protocol-relative ones normalized to ``http:``.
    """
    print('Processing file %s...' % filename)
    with open(filename) as f:
        for line in f:
            # finditer, not search: a single line may reference several
            # assets, and the original dropped all but the first match.
            for match in url_pattern.finditer(line):
                yield add_prefix_if_needed(match.group(0))


def add_prefix_if_needed(url):
    """Return *url* with an 'http:' scheme prepended if it is protocol-relative."""
    return 'http:' + url if url.startswith('//') else url
def scrape(args):
    """Scan the project tree for external asset URLs and record them.

    Args:
        args: parsed CLI namespace with ``exclude`` (directory names to
            skip) and ``pattern`` (regex source for matching URLs).

    Side effects:
        Writes the sorted, de-duplicated URL list to OUTPUT_LINKS_FILENAME,
        or prints a hint if nothing was found.
    """
    url_pattern = re.compile(args.pattern)
    # Collect every external asset URL from every scannable file.
    urls = []
    for filename in get_file_list(args.exclude):
        urls.extend(external_urls(filename, url_pattern))
    urls = sorted(set(urls))
    if not urls:
        print('No external urls found. Are you running this from the right folder?')
        return
    # Save file list, one URL per line.
    with open(OUTPUT_LINKS_FILENAME, 'w') as file_list:
        for url in urls:
            file_list.write(url + "\n")
            print('Wrote url ' + url)
def download_assets(args):
    """Download every URL listed in OUTPUT_LINKS_FILENAME into OUTPUT_FOLDERNAME.

    Args:
        args: parsed CLI namespace (unused; kept for dispatch uniformity).

    Requires an external ``wget`` binary on PATH. ``-c`` resumes partial
    downloads; ``-i`` reads the URL list file.
    """
    if not os.path.exists(OUTPUT_FOLDERNAME):
        os.mkdir(OUTPUT_FOLDERNAME)
    os.chdir(OUTPUT_FOLDERNAME)
    try:
        # Argument list with shell=False (the default) instead of os.system:
        # no shell interpolation of the path, and still portable to any
        # system with wget. TODO: replace with urllib for full portability.
        subprocess.run(['wget', '-c', '-i', os.path.join('..', OUTPUT_LINKS_FILENAME)])
    finally:
        # Always restore the working directory, even if wget is missing.
        os.chdir('..')
def internalize(args):
    """Rewrite project files so external URLs point at local copies.

    Args:
        args: parsed CLI namespace with ``exclude`` (directory names to skip).

    For every URL in OUTPUT_LINKS_FILENAME whose asset was actually
    downloaded into OUTPUT_FOLDERNAME, every occurrence of the URL (and of
    its protocol-relative '//host/path' form) is replaced with
    '/OUTPUT_FOLDERNAME/<basename>'. Files are rewritten atomically via a
    '.tmp' sibling that is renamed over the original.
    """
    # Build (url, basename, was_downloaded) triples from the links file.
    replacements = []
    with open(OUTPUT_LINKS_FILENAME) as file_list:
        for line in file_list:
            external_url = line.strip()
            if not external_url:
                continue  # tolerate blank lines in links.txt
            asset_name = external_url.split('/')[-1]
            downloaded = os.path.exists(os.path.join(OUTPUT_FOLDERNAME, asset_name))
            replacements.append((external_url, asset_name, downloaded))
    for source_file in get_file_list(args.exclude):
        if source_file.endswith('.tmp'):
            continue  # leftover temp file from an interrupted run
        print('Processing %s...' % source_file)
        with open(source_file, 'r') as infile, open(source_file + ".tmp", 'w') as outfile:
            content = infile.read()
            for external_url, asset_name, downloaded in replacements:
                if not downloaded:
                    continue  # keep the external reference if we have no copy
                new_internal_path = '/%s/%s' % (OUTPUT_FOLDERNAME, asset_name)
                content = content.replace(external_url, new_internal_path)
                # Also replace the protocol-relative form ('//host/path').
                # The original sliced url[5:], which is only correct for
                # 'http:' URLs; for 'https:' it produced '://host/path',
                # missing real '//host/path' occurrences and risking
                # corruption of other-scheme URLs in the content.
                if '//' in external_url:
                    protocol_relative = '//' + external_url.split('//', 1)[1]
                    content = content.replace(protocol_relative, new_internal_path)
            outfile.write(content)
        os.rename(source_file + '.tmp', source_file)
def main():
    """Build the command-line interface and dispatch to the chosen subcommand."""
    parser = ArgumentParser(
        description='A simple external assets downloader from source code',
    )
    subparsers = parser.add_subparsers(
        title='subcommands',
        description='Valid subcommands',
        help='Usually you want to run `scrape` first, `download` the assets, and then `internalize` the project',
        dest='parser'
    )
    # add_subparsers does not enforce a subcommand on its own.
    subparsers.required = True
    # 'scrape': find external asset URLs and record them in links.txt.
    # nargs='*' lets several directories be excluded from the command line;
    # without it a CLI-supplied --exclude arrived as a single string, which
    # get_file_list then iterated character by character, skipping almost
    # every directory. The list default is unchanged when the flag is absent.
    scrape_parser = subparsers.add_parser('scrape')
    scrape_parser.add_argument('--exclude', nargs='*', default=DEFAULT_EXCLUDED_SCRAPE)
    scrape_parser.add_argument('--pattern', default=DEFAULT_EXTERNAL_ASSETS_REGEX)
    scrape_parser.set_defaults(func=scrape)
    # 'download': fetch every recorded URL into the external/ folder.
    download_parser = subparsers.add_parser('download')
    download_parser.set_defaults(func=download_assets)
    # 'internalize': rewrite sources to reference the local copies.
    internalize_parser = subparsers.add_parser('internalize')
    internalize_parser.add_argument('--exclude', nargs='*', default=DEFAULT_EXCLUDED_INTERNALIZE)
    internalize_parser.set_defaults(func=internalize)
    args = parser.parse_args()
    args.func(args)


if __name__ == '__main__':
    main()