New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pickle error when dealing with large corpora #20
Comments
Thanks. Pickling in general is a bad idea, so this will go away as I move to a pickle-free serialization format.
This is my quick workaround using h5py:
diff --git a/glove/corpus.py b/glove/corpus.py
index 4545ba4..9fc3c45 100644
--- a/glove/corpus.py
+++ b/glove/corpus.py
@@ -1,11 +1,13 @@
# Cooccurrence matrix construction tools
# for fitting the GloVe model.
import numpy as np
+import scipy.sparse as sp
try:
# Python 2 compat
import cPickle as pickle
except ImportError:
import pickle
+import h5py
from .corpus_cython import construct_cooccurrence_matrix
@@ -70,10 +72,12 @@ class Corpus(object):
max_map_size)
def save(self, filename):
-
- with open(filename, 'wb') as savefile:
- pickle.dump((self.dictionary, self.matrix),
- savefile,
+
+ f = h5py.File('{0}.h5'.format(filename), 'w')
+ _ = f.create_dataset('matrix', data=self.matrix.todense())
+
+ with open('{0}.dict'.format(filename), 'wb') as savefile:
+ pickle.dump(self.dictionary, savefile,
protocol=pickle.HIGHEST_PROTOCOL)
@classmethod
@@ -81,7 +85,10 @@ class Corpus(object):
instance = cls()
- with open(filename, 'rb') as savefile:
- instance.dictionary, instance.matrix = pickle.load(savefile)
+ f = h5py.File('{0}.h5'.format(filename), 'r')
+    instance.matrix = sp.coo_matrix(f['matrix'])
+
+ with open('{0}.dict'.format(filename), 'rb') as savefile:
+ instance.dictionary = pickle.load(savefile)
return instance
diff --git a/glove/glove.py b/glove/glove.py
index c93f69e..f104e8f 100644
--- a/glove/glove.py
+++ b/glove/glove.py
@@ -11,9 +11,31 @@ except ImportError:
import numpy as np
import scipy.sparse as sp
+import h5py
from .glove_cython import fit_vectors, transform_paragraph
class Glove(object):
"""
@@ -189,6 +211,19 @@ class Glove(object):
Serialize model to filename.
"""
+ f = h5py.File('{0}.h5'.format(filename), 'w')
+ if isinstance(self.word_vectors, np.ndarray):
+ _ = f.create_dataset('word_vectors', data=self.word_vectors)
+ if isinstance(self.word_biases, np.ndarray):
+ _ = f.create_dataset('word_biases', data=self.word_biases)
+ if isinstance(self.vectors_sum_gradients, np.ndarray):
+ _ = f.create_dataset('vectors_sum_gradients', data=self.vectors_sum_gradients)
+ if isinstance(self.biases_sum_gradients, np.ndarray):
+ _ = f.create_dataset('biases_sum_gradients', data=self.biases_sum_gradients)
+
+ for field in ['word_vectors', 'word_biases', 'vectors_sum_gradients', 'biases_sum_gradients']:
+ self.__dict__.pop(field)
+
with open(filename, 'wb') as savefile:
pickle.dump(self.__dict__,
savefile,
@@ -205,6 +240,16 @@ class Glove(object):
with open(filename, 'rb') as savefile:
instance.__dict__ = pickle.load(savefile)
+ f = h5py.File('{0}.h5'.format(filename), 'r')
+ if 'word_vectors' in f.keys():
+ instance.word_vectors = np.asarray(f['word_vectors'])
+ if 'word_biases' in f.keys():
+ instance.word_biases = np.asarray(f['word_biases'])
+ if 'vectors_sum_gradients' in f.keys():
+ instance.vectors_sum_gradients = np.asarray(f['vectors_sum_gradients'])
+ if 'biases_sum_gradients' in f.keys():
+ instance.biases_sum_gradients = np.asarray(f['biases_sum_gradients'])
+
return instance
@classmethod
I used Python 3.4 when processing large data with glove-python; pickling works fine.
While this is not an issue with glove-python, it's worth noting that pickling large corpora/models causes the following error:
According to this numpy issue, it is a bug in pickle that has been fixed in Python 3.3.
I think it would be worth pointing that out on the README for future reference.
The text was updated successfully, but these errors were encountered: