Docs.

dimazest · Sep 23, 2014 · 5e18aa8 · 5e18aa8
1 parent eb098dc
commit 5e18aa8
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 22 deletions.
diff --git a/README.rst b/README.rst
@@ -35,20 +35,11 @@ This package provides an iterator over the dataset stored at Google. It
 decompresses the data on the fly and provides you the access to the underlying
 data.
 
+Features
+========
 
-Example use
-===========
-
->>> from google_ngram_downloader import readline_google_store
->>>
->>> fname, url, records = next(readline_google_store(ngram_len=5))
->>> fname
-'googlebooks-eng-all-5gram-20120701-0.gz'
->>> url
-'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-0.gz'
->>> next(records)
-Record(ngram=u'0 " A most useful', year=1860, match_count=1, volume_count=1)
-
+* Download ngrams of various lengths and languages.
+* Access a subset of the ngrams, e.g. only those that start with an 'a'.
 
 Installation
 ============
@@ -62,6 +53,28 @@ The command line tool
 =====================
 
 It also provides a simple command line tool to download the ngrams called
-`google-ngram-downloader`.
+`google-ngram-downloader`. Refer to the help to see available actions::
+
+    google-ngram-downloader help
+    usage: google-ngram-downloader <command> [options]
 
+    commands:
 
+     cooccurrence  Write the cooccurrence frequencies of a word and its contexts.
+     download      Download The Google Books Ngram Viewer dataset version 20120701.
+     help          Show help for a given help topic or a help overview.
+     readline      Print the raw content.
+
+
+Example use of the API
+======================
+
+>>> from google_ngram_downloader import readline_google_store
+>>>
+>>> fname, url, records = next(readline_google_store(ngram_len=5))
+>>> fname
+'googlebooks-eng-all-5gram-20120701-0.gz'
+>>> url
+'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-0.gz'
+>>> next(records)
+Record(ngram=u'0 " A most useful', year=1860, match_count=1, volume_count=1)
diff --git a/google_ngram_downloader/__main__.py b/google_ngram_downloader/__main__.py
@@ -18,7 +18,11 @@ def download(
     ngram_len=('n', 1, 'The length of ngrams to be downloaded.'),
     output=('o', 'downloads/google_ngrams/{ngram_len}', 'The destination folder for downloaded files.'),
     verbose=('v', False, 'Be verbose.'),
-    lang=('l', "eng", 'Language: eng'),
+    lang=(
+        'l',
+        'eng',
+        'Language. [eng|eng-us|eng-gb|eng-fiction|chi-sim|fre|ger|heb|ita|rus|spa]',
+    ),
 ):
     """Download The Google Books Ngram Viewer dataset version 20120701."""
     output = local(output.format(ngram_len=ngram_len))
@@ -39,10 +43,18 @@ def cooccurrence(
     output=('o', 'downloads/google_ngrams/{ngram_len}_cooccurrence', 'The destination folder for downloaded files.'),
     verbose=('v', False, 'Be verbose.'),
     rewrite=('r', False, 'Always rewrite existing files.'),
-    records_in_file=('', 50000000, 'The number of records to be read from the Google store to store in a .json.gz file.'),
-    lang=('l', "eng", 'Language: eng'),
+    records_in_file=(
+        '',
+        50000000,
+        'The number of records to be read from the Google store to store in a .json.gz file.',
+    ),
+    lang=(
+        'l',
+        'eng',
+        'Language. [eng|eng-us|eng-gb|eng-fiction|chi-sim|fre|ger|heb|ita|rus|spa]',
+    ),
 ):
-    """Write the cooccurrence frequncis of a word and its contexts."""
+    """Write the cooccurrence frequencies of a word and its contexts."""
     assert ngram_len > 1
     output_dir = local(output.format(ngram_len=ngram_len))
     output_dir.ensure_dir()
@@ -84,7 +96,11 @@ def cooccurrence(
 @command()
 def readline(
     ngram_len=('n', 2, 'The length of ngrams to be downloaded.'),
-    lang=('l', "eng", 'Language: eng'),
+    lang=(
+        'l',
+        'eng',
+        'Language. [eng|eng-us|eng-gb|eng-fiction|chi-sim|fre|ger|heb|ita|rus|spa]',
+    ),
 ):
     """Print the raw content."""
 

diff --git a/google_ngram_downloader/util.py b/google_ngram_downloader/util.py
@@ -20,9 +20,10 @@ def readline_google_store(ngram_len, lang='eng', indices=None, chunk_size=1024 *
     """Iterate over the data in the Google ngram collection.
 
         :param int ngram_len: the length of ngrams to be streamed.
+        :param str lang: the language of the ngrams.
+        :param iter indices: the file indices to be downloaded.
         :param int chunk_size: the size of the chunks of raw compressed data.
-        :param bool verbose: if `True`, then the debug information is shown to
-        `sys.stderr`.
+        :param bool verbose: if `True`, then the debug information is shown to `sys.stderr`.
 
         :returns: an iterator over triples `(fname, url, records)`
 
@@ -89,7 +90,14 @@ def count_coccurrence(records, index):
 
 
 def iter_google_store(ngram_len, lang="eng", indices=None, verbose=False):
-    """Iterate over the collection files stored at Google."""
+    """Iterate over the collection files stored at Google.
+
+    :param int ngram_len: the length of ngrams to be streamed.
+    :param str lang: the language of the ngrams.
+    :param iter indices: the file indices to be downloaded.
+    :param bool verbose: if `True`, then the debug information is shown to `sys.stderr`.
+
+    """
     version = '20120701'
     session = requests.Session()