Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing serialization error and span label issue #85

Open
wants to merge 2 commits into
base: main
Choose a base branch
from

Conversation

makcedward
Copy link

@makcedward makcedward commented Oct 30, 2023

This PR fixes two issues that occur when using this library:

  • Unable to serialize a list of tuples. Fixed by converting it to a list of dicts.
File [~/.venv/lib/python3.9/site-packages/spacy/tokens/_serialize.py:154], in DocBin.get_docs(self, vocab)
    152 # backwards-compatibility: may be b'' or serialized empty list
    153 if self.span_groups[i] and self.span_groups[i] != SpanGroups._EMPTY_BYTES:
--> 154     doc.spans.from_bytes(self.span_groups[i])
    155 else:
    156     doc.spans.clear()

File [~/.venv/lib/python3.9/site-packages/spacy/tokens/_dict_proxies.py:97], in SpanGroups.from_bytes(self, bytes_data)
     95 else:
     96     for value_bytes, keys in msg.items():
---> 97         group = SpanGroup(doc).from_bytes(value_bytes)
     98         # Deserialize `SpanGroup`s as copies because it's possible for two
     99         # different `SpanGroup`s (pre-serialization) to have the same bytes
    100         # (since they can have the same `.name`).
    101         self[keys[0]] = group

File [~/.venv/lib/python3.9/site-packages/spacy/tokens/span_group.pyx:238], in spacy.tokens.span_group.SpanGroup.from_bytes()

File [~/.venv/lib/python3.9/site-packages/srsly/_msgpack_api.py:27], in msgpack_loads(data, use_list)
     25 # msgpack-python docs suggest disabling gc before unpacking large messages
     26 gc.disable()
---> 27 msg = msgpack.loads(data, raw=False, use_list=use_list)
     28 gc.enable()
     29 return msg

File [~/.venv/lib/python3.9/site-packages/srsly/msgpack/__init__.py:79], in unpackb(packed, **kwargs)
     77         object_hook = functools.partial(decoder, chain=object_hook)
     78     kwargs["object_hook"] = object_hook
---> 79 return _unpackb(packed, **kwargs)

File [~/.venv/lib/python3.9/site-packages/srsly/msgpack/_unpacker.pyx:191], in srsly.msgpack._unpacker.unpackb()

TypeError: unhashable type: 'list'
  • Unable to recognize spaCy's span label. Fixed by using span.label instead of span.label_.
File [~/.venv/lib/python3.9/site-packages/skweak/aggregation.py:126], in AbstractAggregator.fit(self, docs, **kwargs)
    121 """Fits the parameters of the aggregator model based on a collection
    122 of documents. The method extracts a dataframe of observations for
    123 each document and calls the _fit method"""
    125 obs_generator = (self.get_observation_df(doc) for doc in docs)
--> 126 self._fit(obs_generator, **kwargs)

File [~/.venv/lib/python3.9/site-packages/skweak/generative.py:98], in GenerativeModelMixin._fit(self, all_obs, cutoff, n_iter, tol)
     95 self._reset_counts(sources)
     97 # And add the counts from majority voter
---> 98 self._add_mv_counts(all_obs)
    100 # Finally, we postprocess the counts and get probabilities
    101 self._do_mstep_latent()

File [~/.venv/lib/python3.9/site-packages/skweak/generative.py:419], in NaiveBayes._add_mv_counts(self, all_obs)
    416 mv.label_groups = self.label_groups
    417 for obs in all_obs:
    418     # And aggregate the results
--> 419     agg_array = mv.aggregate(obs).values
    421     if len(agg_array)==0:
    422         continue

File [~/.venv/lib/python3.9/site-packages/skweak/voting.py:51], in MajorityVoterMixin.aggregate(self, obs)
     48 def count_fun(x):
     49     return np.bincount(x[x>=0], weights=weights[x>=0], 
     50                        minlength=len(self.observed_labels)) 
---> 51 label_votes = np.apply_along_axis(count_fun, 1, obs.values).astype(np.float32)
     53 # For token-level sequence labelling, we need to normalise the number 
     54 # of "O" occurrences, since they both indicate the absence of 
     55 # prediction, but are also a possible output
     56 if self.observed_labels[0]=="O":

File <__array_function__ internals>:180, in apply_along_axis(*args, **kwargs)

File [~/.venv/lib/python3.9/site-packages/numpy/lib/shape_base.py:376], in apply_along_axis(func1d, axis, arr, *args, **kwargs)
    374     ind0 = next(inds)
    375 except StopIteration as e:
--> 376     raise ValueError(
    377         'Cannot apply_along_axis when any iteration dimensions are 0'
    378     ) from None
    379 res = asanyarray(func1d(inarr_view[ind0], *args, **kwargs))
    381 # build a buffer for storing evaluations of func1d.
    382 # remove the requested axis, and add the new ones on the end.
    383 # laid out so that each write is contiguous.
    384 # for a tuple index inds, buff[inds] = func1d(inarr_view[inds])

ValueError: Cannot apply_along_axis when any iteration dimensions are 0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

1 participant