Merge pull request #2 from cubenlp/use-click
reconstruct using click
RexWzh committed Mar 18, 2024
2 parents dfacd24 + 928d46f commit 8493d8f
Showing 7 changed files with 141 additions and 108 deletions.
35 changes: 32 additions & 3 deletions README-EN.md
@@ -1,6 +1,35 @@
# hf-mirror-download
A command-line tool designed to streamline the process of downloading machine learning models and related files from the Hugging Face model hub mirror site.
# hf-mirror-fetch

https://hf-mirror.com/
[English](README-EN.md) | [简体中文](README.md)

`hf-mirror-fetch` is a command-line tool designed to streamline the process of downloading machine learning models and related files from the Hugging Face model hub mirror site. This tool is particularly useful in environments where access to the primary Hugging Face repository is restricted or where connectivity to the main site is slow.

## Installation

Ensure you have Python 3.6 or higher installed. You can install `hf-mirror-fetch` using the following command:

```bash
pip install hf-mirror-fetch
```

## Usage

Using `hf-mirror-fetch` to download models is straightforward; you only need to provide the URL of the model page. Here is an example of how to use it:

```bash
hfmf -u https://huggingface.co/openchat/openchat_3.5/tree/main
```
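
The URL is normalized before downloading (see `process_url` in `hf_mirror_fetch/cli.py`), so shorter forms of the same model reference should also be accepted, for example:

```bash
hfmf -u https://huggingface.co/openchat/openchat_3.5
hfmf -u openchat/openchat_3.5
```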

### Parameters

- `-u`, `--url`: The URL of the model's page on hf-mirror.com or huggingface.co.
- `-f`, `--tgt_folder`: (Optional) The target folder to save the downloaded files. If not specified, a folder named after the model will be created.
- `--update`: (Optional) Update existing files except for weights.
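
For example, a run that also sets the optional flags might look like this (the target folder name below is just an illustrative placeholder):

```bash
hfmf -u https://hf-mirror.com/openchat/openchat_3.5/tree/main -f ./openchat_3.5 --update
```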

## Contributing

Contributions of any kind are welcome, whether they are feature suggestions, code improvements, or documentation updates. Please submit your contributions through GitHub Issues or Pull Requests.

## License

This project is licensed under the MIT License. For more details, please check the LICENSE file.
43 changes: 34 additions & 9 deletions README.md
@@ -1,12 +1,37 @@
# hf-mirror-download
download huggingface models from mirror site
# hf-mirror-fetch

https://hf-mirror.com/
[English](README-EN.md) | [简体中文](README.md)

# usage
```python
python mirror_download.py https://huggingface.co/microsoft/phi-2/tree/main
# or
python mirror_download.py https://hf-mirror.com/microsoft/phi-2/tree/main
```
`hf-mirror-fetch` is a command-line tool designed to simplify downloading machine learning models and related files from the Hugging Face model hub mirror site. It is particularly useful in environments where access to the main Hugging Face site is restricted or slow.

## Installation

Make sure you have Python 3.6 or higher installed. You can install `hf-mirror-fetch` with the following command:

```bash
pip install hf-mirror-fetch
```

## Usage

Downloading models with `hf-mirror-fetch` is straightforward: just provide the URL of the model page. For example:

```bash
hfmf -u https://huggingface.co/openchat/openchat_3.5/tree/main
hfmf -u https://huggingface.co/openchat/openchat_3.5
hfmf -u openchat/openchat_3.5
```
After finding the model on the site, open the `Files and versions` tab and copy that page's link to download.

### Parameters

- `-u`, `--url`: The URL of the model page on hf-mirror.com or huggingface.co.
- `-f`, `--tgt_folder`: (Optional) The target folder for the downloaded files. If not specified, a folder named after the model will be created.
- `--update`: (Optional) Update existing files, excluding weight files.
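
For example, combining the optional flags might look like this (the folder name is only a placeholder):

```bash
hfmf -u openchat/openchat_3.5 -f ./openchat_3.5 --update
```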

## Contributing

Contributions of any kind are welcome, whether feature suggestions, code improvements, or documentation updates. Please submit them via GitHub Issues or Pull Requests.

## License

This project is licensed under the MIT License. See the LICENSE file for more information.
29 changes: 21 additions & 8 deletions hf_mirror_fetch/cli.py
@@ -1,16 +1,29 @@
"""Console script for hf_mirror_fetch."""
import sys
import click
from hf_mirror_fetch.mirror_download import get_url2names, download_from_mirror_page

def process_url(url):
    ROOT = "https://hf-mirror.com"
    HF = "https://huggingface.co"
    # If URL lacks a known prefix, assume it's intended for Hugging Face
    if not url.startswith((ROOT, HF)):
        url = f"{HF}/{url}" # Prepend with Hugging Face domain
    if url.startswith(HF):
        url = url.replace(HF, ROOT)
    # Ensure URL ends with /tree/main, append if necessary
    if not url.endswith("/tree/main"):
        # Append correctly depending on whether URL already ends with a slash
        url = f"{url.rstrip('/')}/tree/main"
    return url
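# For illustration of the normalization above: inputs like "openchat/openchat_3.5" or
# "https://huggingface.co/openchat/openchat_3.5" both resolve to
# "https://hf-mirror.com/openchat/openchat_3.5/tree/main".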


@click.command()
def main(args=None):
@click.option('-u', '--url', required=True, help='The URL of the model\'s page on hf-mirror.com or huggingface.co.')
@click.option('-f', '--tgt_folder', default=None, type=str, help='The target folder to save the downloaded files.')
@click.option('--update', is_flag=True, help='Update existing files except for weights.')
def main(url, tgt_folder, update):
"""Console script for hf_mirror_fetch."""
click.echo("Replace this message by putting your code into "
"hf_mirror_fetch.cli.main")
click.echo("See click documentation at https://click.palletsprojects.com/")
url = process_url(url)
download_from_mirror_page(url, tgt_folder, update)
return 0


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
96 changes: 36 additions & 60 deletions hf_mirror_fetch/mirror_download.py
@@ -1,27 +1,21 @@
# -*- coding: utf-8 -*-
# @File : mirror_download.py
# @Time : 2023/12/03 21:56:47
# @Author : Qing
# @Email : aqsz2526@outlook.com
######################### docstring ########################
'''
Since huggingface.co cannot be accessed directly, download models from hf-mirror.com instead
"""
mirror_download.py
Provide the URL of the file and versions page and download all of its files
eg: https://huggingface.co/openchat/openchat_3.5/tree/main
A tool to download models from hf-mirror.com, catering to situations where huggingface.co is inaccessible.
This script provides functionalities to:
1. Parse all file links and names from an HTML page.
2. Retrieve complete file information in cases where not all items are loaded in a single go, through JSON.
3. Use wget for downloading files, supporting resume for interrupted downloads.
1. Parse the links and file names of all files from the HTML
2. When not everything is loaded in one go, fetch the information for all files from the JSON, which also includes the links parsed from the HTML above
3. Call wget -c to download all files; interrupted downloads can be resumed
'''
Example usage:
python mirror_download.py --url https://huggingface.co/openchat/openchat_3.5/tree/main --tgt_folder my_model_folder --update
Requirements:
requests, bs4, click
"""
import os
import json
try:
    import fire
except:
    raise Exception("fire not installed, run `pip install fire`")
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
@@ -31,15 +25,15 @@


def get_next_page_items(soup, url):
""" 页面 item 数量太多,一次性没有加载完全,通过再次请求解析 json 得到完整文件列表
! 当前实现假设只需要点击一次 `load more` 按钮即可加载完全
""" For pages with too many items to load all at once, parse the json from another request to get the complete file list.
! This implementation assumes that clicking the `load more` button once is sufficient to load all items.
"""
    obj = soup.find_all('div',attrs={'data-target':"ViewerIndexTreeList"})
    data_props = json.loads(obj[0]['data-props'])
    current_items = data_props['entries']

    next_page_url = data_props['nextURL']
    if next_page_url is not None: # there is a next page
    if next_page_url is not None: # If there's a next page
        data = requests.get(f"{ROOT}{next_page_url}").json()
        all_items = current_items + data
    else:
@@ -51,17 +45,16 @@ def get_next_page_items(soup, url):
        if item['type'] == 'file':
            name = item['path']
            _url = f"{download_url}/{name}?download=true"
            _url = quote(_url, safe=":/?=&") # File names may contain spaces; encode them with quote
            _url = quote(_url, safe=":/?=&") # Encode spaces in file names with quote
            url2names.append((_url, name))
    return url2names


def save_with_wget(url, file):
os.system(f"wget -c {url} -O {file}")

def get_url2names(url):
"""
获取网页中的下载链接和文件名
Fetch download links and filenames from the webpage.
"""

print("="*50)
@@ -71,27 +64,26 @@ def get_url2names(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Parse all file links and file names from the HTML
    # Parse all file links and names from the HTML
    a_tags = soup.find_all(title="Download file")
    if len(a_tags) == 0 :
    if len(a_tags) == 0:
        print("!"*80)
        print("No files detected! please check the input url. NOT `model card`, use url for `Files and versions` that contains `/tree/main`.")
        print("No files detected! Please check the input URL. Use the URL for `Files and versions` that contains `/tree/main`, not `model card`.")
        print("Exiting ...")
        exit()

    # unquote: file names do not need URL encoding
    # unquote: filenames do not need URL encoding
    url2names = [
        (ROOT + a_tag['href'], unquote(a_tag['href'].replace('?download=true', '').split('resolve/main/')[-1]))
        for a_tag in a_tags
    ]

    # Retrieve the information for all files from the JSON, including the links parsed from the HTML above
    # Retrieve all file information from JSON, including the links parsed from HTML
    all_url2names = get_next_page_items(soup, url)
    for item in url2names:
        assert item in all_url2names, f"{item} not in all_url2names"  # Check that the two are consistent

        assert item in all_url2names, f"{item} not in all_url2names"  # Verify consistency

    # Overwrite the original url2names and print it
    # Overwrite original url2names and print
    url2names = all_url2names
    print("="*50)
    for _url, name in url2names:
@@ -100,31 +92,22 @@ def get_url2names(url):
print(f"{len(url2names)} files in total!")
return url2names



def download_from_mirror_page(URL, tgt_folder=None, update=True):
"""从hf-mirror.com下载模型
"""Download models from hf-mirror.com
Args:
url: 模型链接 huggingface.co 或者 hf-mirror.comresolve/main/ 页面
tgt_folder: 保存路径. Defaults to None.
update: 仓库有更新的话,更新所有权重以外的文件. Defaults to True. || todo wget -c 好像已经实现了这个功能
url: Model link from huggingface.co or hf-mirror.com's resolve/main/ page
tgt_folder: Save path. Defaults to None.
update: If the repository is updated, update all files except the weights. Defaults to True. || todo: wget -c seems to already implement this functionality
"""
    # Check that the URL is correct and adjust it
    if not URL.startswith(ROOT):
        assert URL.startswith("https://huggingface.co"), "make sure download from hf-mirror.com or huggingface.co"
        URL = URL.replace("huggingface.co", "hf-mirror.com")

    # Parse all file links and file names from the URL
    # Parse all file links and names from the URL
    url2names = get_url2names(URL)

    # import pdb;pdb.set_trace();
    if tgt_folder is None:
        tgt_folder = URL.replace(ROOT+"/", '').replace('/tree/main', '')

    tgt_folder = os.path.join(".", tgt_folder)

    # Check whether the folder exists and try to create it
    # Check if the folder exists, try to create
    try:
        if os.path.exists(tgt_folder):
            print(f"Folder {tgt_folder} already exists !!! Continuing the download will overwrite the files in it. ")
@@ -133,21 +116,21 @@ def download_from_mirror_page(URL, tgt_folder=None, update=True):
        print(e)
        print("Failed to create folder! Check if the model has been downloaded. Exiting ...")
        exit()
    # Whether to continue downloading
    flag = input(f'saving to {tgt_folder}\n Continue downloading? Y/N\n').strip().lower()
    # Confirm continuation of download
    flag = input(f'Saving to {tgt_folder}\n Continue downloading? Y/N\n').strip().lower()
    if flag == 'y':
        pass
    elif flag == 'n':
        print("Canceled. Exiting ...")
        exit()
        return None
    else:
        raise Exception("y or n")
        raise Exception("Please answer y or n")


    # download part
    # Download part
    for url, name in url2names:
        if url.endswith(".h5") or url.endswith(".ot") or url.endswith(".msgpack"):
            # Generally only .bin and .safetensors are downloaded
            # Generally, only download .bin and .safetensors
            continue

        tgt_path = os.path.join(tgt_folder, name)
@@ -158,10 +141,3 @@ def download_from_mirror_page(URL, tgt_folder=None, update=True):
        save_with_wget(url, tgt_path)
        # save_file(url, os.path.join(tgt_folder, name))
        # save_file_with_resume(url, os.path.join(tgt_folder, name))

if __name__ == '__main__':
    fire.Fire(download_from_mirror_page)

# get_url2names('https://huggingface.co/openchat/openchat_3.5/tree/main') # no next page
# get_url2names('https://huggingface.co/Qwen/Qwen-72B-Chat/tree/main') # 99 items
# download_from_mirror_page("https://huggingface.co/microsoft/phi-1_5/tree/main")
20 changes: 0 additions & 20 deletions setup.cfg

This file was deleted.

8 changes: 6 additions & 2 deletions setup.py
@@ -9,7 +9,11 @@
with open('README.md') as readme_file:
    readme = readme_file.read()

requirements = ['Click>=7.0', ]
requirements = [
    'Click>=7.0',
    'requests',
    'beautifulsoup4'
]

test_requirements = ['pytest>=3', ]

@@ -30,7 +34,7 @@
description="A command-line tool designed to streamline the process of downloading machine learning models and related files from the Hugging Face model hub mirror site.",
entry_points={
'console_scripts': [
'hf_mirror_fetch=hf_mirror_fetch.cli:main',
'hfmf=hf_mirror_fetch.cli:main',
],
},
install_requires=requirements,
18 changes: 12 additions & 6 deletions tests/test_hf_mirror_fetch.py
@@ -26,11 +26,17 @@ def test_content(response):


def test_command_line_interface():
"""Test the CLI."""
"""Test the Command Line Interface (CLI)."""
runner = CliRunner()
result = runner.invoke(cli.main)
assert result.exit_code == 0
assert 'hf_mirror_fetch.cli.main' in result.output

# Mock URL for testing
mock_url = "https://huggingface.co/openchat/openchat_3.5/tree/main"

# Testing the main command with required URL argument
result = runner.invoke(cli.main, ['--url', mock_url], input="N\n")
print(result.output) # To help debugging
assert result.exit_code == 0, f"Command exited with {result.exit_code} and output: {result.output}"

# Testing the help option to ensure CLI setup is correct
help_result = runner.invoke(cli.main, ['--help'])
assert help_result.exit_code == 0
assert '--help Show this message and exit.' in help_result.output
assert help_result.exit_code == 0, "Help command failed"
