Number of comments are changing after embedding #116

Open
TahaMunir1 opened this issue Jan 1, 2020 · 0 comments

Comments

@TahaMunir1
I am trying to embed some strings but am facing an issue. When I pass a list of strings in a different language (such as French), the input and output counts do not match. For example, if I pass 4 French strings, I get an output of shape (2, 1024). In some cases the output count increases instead. It works fine for English, but emoticons or non-English characters trigger this issue. Any help?
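One possible cause (an assumption, not confirmed from the code alone): `lines_to_index` below writes the input with `"\n".join(lines)`, so any string that itself contains a newline becomes two lines on disk, and the encoder then sees a different sentence count than was passed in. A minimal sketch of the mismatch, with a hypothetical `sanitize_lines` workaround:

```python
def sanitize_lines(lines):
    """Replace embedded newlines so each string maps to exactly one file line."""
    return [line.replace("\r", " ").replace("\n", " ") for line in lines]

lines = ["Bonjour\nle monde", "Ça va ?"]
print(len(lines))                                          # 2 strings passed in
print(len("\n".join(lines).split("\n")))                   # 3 lines written to disk
print(len("\n".join(sanitize_lines(lines)).split("\n")))   # back to 2
```

If the counts still diverge after sanitizing, the change probably happens inside the tokenization or BPE step instead.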

import tempfile
from pathlib import Path
from typing import List

import numpy as np

from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply, SplitLines, JoinEmbed
from indexing import IndexCreate

def lines_to_index(lang: str, lines: List, model_path: str, bpe_code_path: str, use_cpu: bool = False, batch_size: int = 32):
    """Suitable for small amounts of data."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        target = str(Path(tmpdirname) / "source")
        with open(target, "w") as fout:
            fout.write("\n".join(lines))
        return text_file_pipeline(
            lang, target, model_path, bpe_code_path, use_cpu, returns="index", batch_size=batch_size
        )

def text_file_pipeline(lang: str, input_path: str, model_path: str, bpe_code_path: str, use_cpu: bool, batch_size: int,  returns="index"):
    """Suitable for small amounts of data."""
    encoder = SentenceEncoder(
        model_path,
        max_sentences=batch_size,
        max_tokens=10000,
        cpu=use_cpu)
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        Token(
            input_path,
            str(tmpdir / "token"),
            lang=lang,
            romanize=False,
            lower_case=True, gzip=False,
            verbose=True)
        BPEfastApply(
            str(tmpdir / "token"),
            str(tmpdir / "bpe"),
            bpe_code_path,
            verbose=True, over_write=True)
        EncodeFile(
            encoder,
            str(tmpdir / "bpe"),
            str(tmpdir / "enc"),
            verbose=True, over_write=True)
        if returns == "embeddings":
            # raw float32 vectors; reshape to (-1, embedding_dim) downstream
            return np.fromfile(str(tmpdir / "enc"), dtype=np.float32, count=-1)
        data, index = IndexCreate(
            str(tmpdir / "enc"), 'FlatL2',
            verbose=True, save_index=False)
        return data, index
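Note that with `returns="embeddings"`, `np.fromfile` yields a flat 1-D float32 array. One way to check whether the sentence count survived the pipeline is to reshape by the embedding dimension (1024 here, matching the shape reported above) and compare the row count against the number of input lines. A small sketch with fake data standing in for the encoder output; the helper name `load_embeddings` is my own:

```python
import os
import tempfile

import numpy as np

def load_embeddings(enc_path: str, dim: int = 1024) -> np.ndarray:
    """Read raw float32 vectors and reshape to (n_sentences, dim)."""
    vecs = np.fromfile(enc_path, dtype=np.float32)
    assert vecs.size % dim == 0, "file size is not a multiple of the embedding dim"
    return vecs.reshape(-1, dim)

# Demo with fake data in place of the real "enc" file.
with tempfile.NamedTemporaryFile(suffix=".enc", delete=False) as f:
    np.zeros((4, 1024), dtype=np.float32).tofile(f)
emb = load_embeddings(f.name)
print(emb.shape)  # (4, 1024) — row count should equal the number of input lines
os.unlink(f.name)
```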