Merge branch 'release-3.7.3'

piskvorky · May 6, 2019 · 69877c5 · 69877c5
2 parents 7631b3e + d2634a5
commit 69877c5
Show file tree

Hide file tree

Showing 33 changed files with 4,187 additions and 3,003 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,51 @@
 Changes
-===========
+=======
+
+## 3.7.3, 2019-05-06
+
+### :red_circle: Bug fixes
+
+* Fix fasttext model loading from gzip files ([mpenkov](https://github.com/mpenkov), [#2476](https://github.com/RaRe-Technologies/gensim/pull/2476))
+* Fix misleading Doc2Vec.docvecs comment ([gojomo](https://github.com/gojomo), [#2472](https://github.com/RaRe-Technologies/gensim/pull/2472))
+* Nmf bugfix ([mpenkov](https://github.com/mpenkov), [#2466](https://github.com/RaRe-Technologies/gensim/pull/2466))
+* Fix WordEmbeddingsKeyedVectors.most_similar ([Witiko](https://github.com/Witiko), [#2461](https://github.com/RaRe-Technologies/gensim/pull/2461))
+* Fix backwards compatibility ([mpenkov](https://github.com/mpenkov), [#2457](https://github.com/RaRe-Technologies/gensim/pull/2457))
+* Fix Lda Sequence model by updating to num_documents ([Bharat123rox](https://github.com/Bharat123rox), [#2410](https://github.com/RaRe-Technologies/gensim/pull/2410))
+* Make termsim matrix positive definite even with negative similarities ([Witiko](https://github.com/Witiko), [#2397](https://github.com/RaRe-Technologies/gensim/pull/2397))
+* Fix the off-by-one bug in the TFIDF model ([AMR-KELEG](https://github.com/AMR-KELEG), [#2392](https://github.com/RaRe-Technologies/gensim/pull/2392))
+* Update legacy model loading, fix #2453 ([mpenkov](https://github.com/mpenkov), [#2454](https://github.com/RaRe-Technologies/gensim/pull/2454))
+* Make matutils.unitvec always return float norm when requested ([Witiko](https://github.com/Witiko), [#2419](https://github.com/RaRe-Technologies/gensim/pull/2419))
+
+### :books: Tutorial and doc improvements
+
+* Update word2vec.ipynb ([asyabo](https://github.com/asyabo), [#2423](https://github.com/RaRe-Technologies/gensim/pull/2423))
+
+### :+1: Improvements
+
+* Add type check for corpus_file argument ([saraswatmks](https://github.com/saraswatmks), [#2469](https://github.com/RaRe-Technologies/gensim/pull/2469))
+* Clean up FastText Cython code, fix division by zero ([mpenkov](https://github.com/mpenkov), [#2382](https://github.com/RaRe-Technologies/gensim/pull/2382))
+
+### :warning: Deprecations (will be removed in the next major release)
+
+* Remove
+    - `gensim.models.FastText.load_fasttext_format`: use load_facebook_vectors to load embeddings only (faster, less CPU/memory usage, does not support training continuation) and load_facebook_model to load full model (slower, more CPU/memory intensive, supports training continuation)
+    - `gensim.models.wrappers.fasttext` (obsoleted by the new native `gensim.models.fasttext` implementation)
+    - `gensim.examples`
+    - `gensim.nosy`
+    - `gensim.scripts.word2vec_standalone`
+    - `gensim.scripts.make_wiki_lemma`
+    - `gensim.scripts.make_wiki_online`
+    - `gensim.scripts.make_wiki_online_lemma`
+    - `gensim.scripts.make_wiki_online_nodebug`
+    - `gensim.scripts.make_wiki` (all of these obsoleted by the new native `gensim.scripts.segment_wiki` implementation)
+    - "deprecated" functions and attributes
+
+* Move
+    - `gensim.scripts.make_wikicorpus` ➡ `gensim.scripts.make_wiki.py`
+    - `gensim.summarization` ➡ `gensim.models.summarization`
+    - `gensim.topic_coherence` ➡ `gensim.models._coherence`
+    - `gensim.utils` ➡ `gensim.utils.utils` (old imports will continue to work)
+    - `gensim.parsing.*` ➡ `gensim.utils.text_utils`
 
 ## 3.7.2, 2019-04-06
 
@@ -22,7 +68,7 @@ Changes
 
 ### :+1: Improvements
 
-* Undo the hash2index optimization (__[mpenkov](https://github.com/mpenkov)__, [#2370](https://github.com/RaRe-Technologies/gensim/pull/2387))
+* Undo the hash2index optimization (__[mpenkov](https://github.com/mpenkov)__, [#2370](https://github.com/RaRe-Technologies/gensim/pull/2370))
 
 ### :warning: Changes in FastText behavior
 

diff --git a/docs/notebooks/word2vec.ipynb b/docs/notebooks/word2vec.ipynb
@@ -116,7 +116,7 @@
     " \n",
     "    def __iter__(self):\n",
     "        for fname in os.listdir(self.dirname):\n",
-    "            for line in smart_open(os.path.join(self.dirname, fname), 'rb'):\n",
+    "            for line in smart_open(os.path.join(self.dirname, fname), 'r'):\n",
     "                yield line.split()"
    ]
   },

diff --git a/docs/src/conf.py b/docs/src/conf.py
@@ -57,7 +57,7 @@
 # The short X.Y version.
 version = '3.7'
 # The full version, including alpha/beta/rc tags.
-release = '3.7.2'
+release = '3.7.3'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/gensim/__init__.py b/gensim/__init__.py
@@ -5,7 +5,7 @@
 from gensim import parsing, corpora, matutils, interfaces, models, similarities, summarization, utils  # noqa:F401
 import logging
 
-__version__ = '3.7.2'
+__version__ = '3.7.3'
 
 
 logger = logging.getLogger('gensim')

diff --git a/gensim/matutils.py b/gensim/matutils.py
@@ -734,15 +734,18 @@ def unitvec(vec, norm='l2', return_norm=False):
                 return vec
         else:
             if return_norm:
-                return vec, 1.
+                return vec, 1.0
             else:
                 return vec
 
     if isinstance(vec, np.ndarray):
         if norm == 'l1':
             veclen = np.sum(np.abs(vec))
         if norm == 'l2':
-            veclen = blas_nrm2(vec)
+            if vec.size == 0:
+                veclen = 0.0
+            else:
+                veclen = blas_nrm2(vec)
         if veclen > 0.0:
             if np.issubdtype(vec.dtype, np.integer):
                 vec = vec.astype(np.float)
@@ -752,14 +755,17 @@ def unitvec(vec, norm='l2', return_norm=False):
                 return blas_scal(1.0 / veclen, vec).astype(vec.dtype)
         else:
             if return_norm:
-                return vec, 1
+                return vec, 1.0
             else:
                 return vec
 
     try:
         first = next(iter(vec))  # is there at least one element?
     except StopIteration:
-        return vec
+        if return_norm:
+            return vec, 1.0
+        else:
+            return vec
 
     if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format
         if norm == 'l1':

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
@@ -31,6 +31,7 @@
 
 import codecs
 import collections
+import gzip
 import io
 import logging
 import struct
@@ -74,6 +75,14 @@
     ('t', 'd'),
 ]
 
+_FLOAT_SIZE = struct.calcsize('@f')
+if _FLOAT_SIZE == 4:
+    _FLOAT_DTYPE = np.dtype(np.float32)
+elif _FLOAT_SIZE == 8:
+    _FLOAT_DTYPE = np.dtype(np.float64)
+else:
+    _FLOAT_DTYPE = None
+
 
 def _yield_field_names():
     for name, _ in _OLD_HEADER_FORMAT + _NEW_HEADER_FORMAT:
@@ -220,24 +229,65 @@ def _load_matrix(fin, new_format=True):
         The number of columns of the array will correspond to the vector size.
 
     """
+    if _FLOAT_DTYPE is None:
+        raise ValueError('bad _FLOAT_SIZE: %r' % _FLOAT_SIZE)
+
     if new_format:
         _struct_unpack(fin, '@?')  # bool quant_input in fasttext.cc
 
     num_vectors, dim = _struct_unpack(fin, '@2q')
+    count = num_vectors * dim
 
-    float_size = struct.calcsize('@f')
-    if float_size == 4:
-        dtype = np.dtype(np.float32)
-    elif float_size == 8:
-        dtype = np.dtype(np.float64)
+    #
+    # numpy.fromfile doesn't play well with gzip.GzipFile as input:
+    #
+    # - https://github.com/RaRe-Technologies/gensim/pull/2476
+    # - https://github.com/numpy/numpy/issues/13470
+    #
+    # Until they fix it, we have to apply a workaround.  We only apply the
+    # workaround when it's necessary, because np.fromfile is heavily optimized
+    # and very efficient (when it works).
+    #
+    if isinstance(fin, gzip.GzipFile):
+        logger.warning(
+            'Loading model from a compressed .gz file.  This can be slow. '
+            'This is a work-around for a bug in NumPy: https://github.com/numpy/numpy/issues/13470. '
+            'Consider decompressing your model file for a faster load. '
+        )
+        matrix = _fromfile(fin, _FLOAT_DTYPE, count)
     else:
-        raise ValueError("Incompatible float size: %r" % float_size)
+        matrix = np.fromfile(fin, _FLOAT_DTYPE, count)
 
-    matrix = np.fromfile(fin, dtype=dtype, count=num_vectors * dim)
+    assert matrix.shape == (count,), 'expected (%r,),  got %r' % (count, matrix.shape)
     matrix = matrix.reshape((num_vectors, dim))
     return matrix
 
 
+def _batched_generator(fin, count, batch_size=1e6):
+    """Read `count` floats from `fin`.
+
+    Batches up read calls to avoid I/O overhead.  Keeps no more than batch_size
+    floats in memory at once.
+
+    Yields floats.
+
+    """
+    while count > batch_size:
+        batch = _struct_unpack(fin, '@%df' % batch_size)
+        for f in batch:
+            yield f
+        count -= batch_size
+
+    batch = _struct_unpack(fin, '@%df' % count)
+    for f in batch:
+        yield f
+
+
+def _fromfile(fin, dtype, count):
+    """Reimplementation of numpy.fromfile."""
+    return np.fromiter(_batched_generator(fin, count), dtype=dtype)
+
+
 def load(fin, encoding='utf-8', full_model=True):
     """Load a model from a binary stream.
 

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -70,7 +70,7 @@
 except ImportError:
     from Queue import Queue  # noqa:F401
 
-from collections import namedtuple, defaultdict
+from collections import namedtuple, defaultdict, Iterable
 from timeit import default_timer
 
 from numpy import zeros, float32 as REAL, empty, ones, \
@@ -447,18 +447,13 @@ class Doc2Vec(BaseWordEmbeddingsModel):
         directly to query those embeddings in various ways. See the module level docstring for examples.
 
     docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
-        This object contains the paragraph vectors. Remember that the only difference between this model and
-        :class:`~gensim.models.word2vec.Word2Vec` is that besides the word vectors we also include paragraph embeddings
-        to capture the paragraph.
+        This object contains the paragraph vectors learned from the training data. There will be one such vector
+        for each unique document tag supplied during training. They may be individually accessed using the tag
+        as an indexed-access key. For example, if one of the training documents used a tag of 'doc003':
 
-        In this way we can capture the difference between the same word used in a different context.
-        For example we now have a different representation of the word "leaves" in the following two sentences ::
-
-            1. Manos leaves the office every day at 18:00 to catch his train
-            2. This season is called Fall, because leaves fall from the trees.
+        .. sourcecode:: pycon
 
-        In a plain :class:`~gensim.models.word2vec.Word2Vec` model the word would have exactly the same representation
-        in both sentences, in :class:`~gensim.models.doc2vec.Doc2Vec` it will not.
+            >>> model.docvecs['doc003']
 
     vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab`
         This object represents the vocabulary (sometimes called Dictionary in gensim) of the model.
@@ -794,6 +789,19 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
 
         """
         kwargs = {}
+
+        if corpus_file is None and documents is None:
+            raise TypeError("Either one of corpus_file or documents value must be provided")
+
+        if corpus_file is not None and documents is not None:
+            raise TypeError("Both corpus_file and documents must not be provided at the same time")
+
+        if documents is None and not os.path.isfile(corpus_file):
+            raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file)
+
+        if documents is not None and not isinstance(documents, Iterable):
+            raise TypeError("documents must be an iterable of list, got %r instead" % documents)
+
         if corpus_file is not None:
             # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file)
             offsets, start_doctags = self._get_offsets_and_start_doctags_for_corpusfile(corpus_file, self.workers)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
@@ -280,10 +280,12 @@
 """
 
 import logging
+import os
 
 import numpy as np
 from numpy import ones, vstack, float32 as REAL, sum as np_sum
 import six
+from collections import Iterable
 
 import gensim.models._fasttext_bin
 
@@ -901,6 +903,19 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
             >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
 
         """
+
+        if corpus_file is None and sentences is None:
+            raise TypeError("Either one of corpus_file or sentences value must be provided")
+
+        if corpus_file is not None and sentences is not None:
+            raise TypeError("Both corpus_file and sentences must not be provided at the same time")
+
+        if sentences is None and not os.path.isfile(corpus_file):
+            raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file)
+
+        if sentences is not None and not isinstance(sentences, Iterable):
+            raise TypeError("sentences must be an iterable of list, got %r instead" % sentences)
+
         super(FastText, self).train(
             sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
@@ -1023,30 +1038,22 @@ def load(cls, *args, **kwargs):
         """
         try:
             model = super(FastText, cls).load(*args, **kwargs)
-            if hasattr(model.wv, 'hash2index'):
-                gensim.models.keyedvectors._rollback_optimization(model.wv)
 
             if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
                 model.trainables.vectors_vocab_lockf = ones(model.wv.vectors_vocab.shape, dtype=REAL)
             if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
                 model.trainables.vectors_ngrams_lockf = ones(model.wv.vectors_ngrams.shape, dtype=REAL)
 
-            if not hasattr(model.wv, 'compatible_hash'):
-                logger.warning(
-                    "This older model was trained with a buggy hash function. "
-                    "The model will continue to work, but consider training it "
-                    "from scratch."
-                )
-                model.wv.compatible_hash = False
-
             if not hasattr(model.wv, 'bucket'):
                 model.wv.bucket = model.trainables.bucket
-
-            return model
         except AttributeError:
             logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.')
             from gensim.models.deprecated.fasttext import load_old_fasttext
-            return load_old_fasttext(*args, **kwargs)
+            model = load_old_fasttext(*args, **kwargs)
+
+        gensim.models.keyedvectors._try_upgrade(model.wv)
+
+        return model
 
     @deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")
     def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):