spacy implementation with certain verb types removed #31

chriseal · 2018-11-03T19:52:58Z

Thanks for sharing! Here's the rake.py file edited to use spaCy instead of NLTK. It removes certain verb types in _get_phrase_list_from_words, which I found to improve performance a bit (on a small sample).

# -*- coding: utf-8 -*-
"""Implementation of Rapid Automatic Keyword Extraction algorithm.

As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
"""

# ADAPTED TO USE SPACY INSTEAD OF NLTK

import string
from collections import Counter, defaultdict
from itertools import chain, groupby, product

from enum import Enum
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the 'en_core_web_sm' spaCy pipeline once, at import time. The resulting
# module-level `nlp` callable is what the code below uses to split text into
# sentences and to tokenize/tag each sentence.
nlp = spacy.load('en_core_web_sm')

class Metric(Enum):
    """Different metrics that can be used for ranking.

    With d(w) the degree of a word and f(w) its frequency, each member
    selects which per-word score is summed to rank a phrase.
    """

    DEGREE_TO_FREQUENCY_RATIO = 0  # Uses d(w)/f(w) as the metric
    WORD_DEGREE = 1  # Uses d(w) alone as the metric
    WORD_FREQUENCY = 2  # Uses f(w) alone as the metric

class Rake(object):
"""Rapid Automatic Keyword Extraction Algorithm.

This variant tokenizes and tags text with spaCy. Candidate phrases are
runs of consecutive tokens that are neither stopwords, punctuation, nor
words tagged with one of the configured verb tags; each phrase is ranked
by summing a per-word score selected through ``Metric``.
"""

def __init__(
    self,
    stopwords=None,
    punctuations=None,
    language="english",
    ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
    max_length=100000,
    min_length=1,
    verb_tags_to_rm=None
):
    """Configure the extractor.

    :param stopwords: Words to be ignored for keyword extraction.
                      Defaults to a list built from spaCy's ``STOP_WORDS``.
    :param punctuations: Punctuation symbols to be ignored for keyword
                         extraction. Defaults to ``string.punctuation``.
    :param language: Accepted, but not read anywhere in this constructor.
    :param ranking_metric: ``Metric`` member used to score words. Any value
                           that is not a ``Metric`` instance falls back to
                           ``Metric.DEGREE_TO_FREQUENCY_RATIO``.
    :param max_length: Maximum limit on the number of words in a phrase
                       (Inclusive. Defaults to 100000)
    :param min_length: Minimum limit on the number of words in a phrase
                       (Inclusive. Defaults to 1)
    :param verb_tags_to_rm: Collection of part-of-speech tags; words whose
                            tag is in it are excluded from phrases.
                            Defaults to ``{'VB', 'VBD', 'VBP', 'VBZ'}``.
    """
    # Degree-to-frequency ratio is the fallback for anything that is not a
    # Metric member.
    self.metric = (
        ranking_metric
        if isinstance(ranking_metric, Metric)
        else Metric.DEGREE_TO_FREQUENCY_RATIO
    )

    # Fall back to spaCy's stopword list / every ASCII punctuation symbol
    # when the caller supplies nothing.
    self.stopwords = list(STOP_WORDS) if stopwords is None else stopwords
    self.punctuations = (
        string.punctuation if punctuations is None else punctuations
    )

    # Verb tags and what the default does with each of them:
    #   RM:   VB   verb, base form
    #   RM:   VBD  verb, past tense
    #   KEEP: VBG  verb, gerund or present participle
    #   KEEP: VBN  verb, past participle
    #   RM:   VBP  verb, non-3rd person singular present
    #   RM:   VBZ  verb, 3rd person singular present
    self.verb_tags_to_rm = (
        {'VB', 'VBD', 'VBP', 'VBZ'}
        if verb_tags_to_rm is None
        else verb_tags_to_rm
    )

    # Everything that acts as a phrase break during keyword extraction.
    self.to_ignore = set(chain(self.stopwords, self.punctuations))

    # Inclusive bounds on the number of words in a candidate phrase.
    self.min_length = min_length
    self.max_length = max_length

    # Results; populated by the extract_keywords_* methods.
    self.frequency_dist = None
    self.degree = None
    self.rank_list = None
    self.ranked_phrases = None


def extract_keywords_from_text(self, text):
    """Method to extract keywords from the text provided.

    The text is lower-cased, split into sentences with the module-level
    spaCy pipeline and handed to ``extract_keywords_from_sentences``.

    :param text: Text to extract keywords from, provided as a string.
    """
    # Sentences consisting of nothing but a terminator carry no keywords.
    # Each terminator must be its own set element: a single '.!?' string
    # would only ever match that exact three-character sentence.
    terminators = {'.', '!', '?'}
    sentences = [
        sent_text
        for sent_text in (str(sent) for sent in nlp(text.lower()).sents)
        if sent_text not in terminators
    ]
    self.extract_keywords_from_sentences(sentences)

def extract_keywords_from_sentences(self, sentences):
    """Run the full extraction pipeline over pre-split sentences.

    :param sentences: Text to extract keywords from, provided as a list
                      of strings, where each string is a sentence.
    """
    candidates = self._generate_phrases(sentences)
    # Order matters: the rank list reads the frequency and degree tables
    # produced by the two steps before it.
    pipeline = (
        self._build_frequency_dist,
        self._build_word_co_occurance_graph,
        self._build_ranklist,
    )
    for step in pipeline:
        step(candidates)

def get_ranked_phrases(self):
    """Return the extracted keyword phrases, best ranked first.

    :return: List of strings, one per extracted phrase, in the order of
             the rank list; ``None`` until an extraction has been run.
    """
    return self.ranked_phrases

def get_ranked_phrases_with_scores(self):
    """Return the extracted keyword phrases together with their scores.

    :return: List of ``(score, phrase)`` tuples sorted by descending
             score, e.g. ``(5.68, 'four scores')``; ``None`` until an
             extraction has been run.
    """
    return self.rank_list

def get_word_frequency_distribution(self):
    """Return how often each word occurs across the candidate phrases.

    :return: Mapping of the format `word -> frequency` (the ``Counter``
             built during extraction); ``None`` until an extraction has
             been run.
    """
    return self.frequency_dist

def get_word_degrees(self):
    """Return the degree of every word seen in the candidate phrases.

    The degree of a word is the sum of its co-occurrence counts with the
    words of the phrases it appears in (itself included).

    :return: Dictionary (defaultdict) of the format `word -> degree`;
             ``None`` until an extraction has been run.
    """
    return self.degree

def _build_frequency_dist(self, phrase_list):
    """Builds frequency distribution of the words in the given body of text.
    :param phrase_list: List of List of strings where each sublist is a
                        collection of words which form a contender phrase.
    """
    self.frequency_dist = Counter(chain.from_iterable(phrase_list))

def _build_word_co_occurance_graph(self, phrase_list):
    """Builds the co-occurance graph of words in the given body of text to
    compute degree of each word.
    :param phrase_list: List of List of strings where each sublist is a
                        collection of words which form a contender phrase.
    """
    co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
    for phrase in phrase_list:
        # For each phrase in the phrase list, count co-occurances of the
        # word with other words in the phrase.
        #
        # Note: Keep the co-occurances graph as is, to help facilitate its
        # use in other creative ways if required later.
        for (word, coword) in product(phrase, phrase):
            co_occurance_graph[word][coword] += 1
    self.degree = defaultdict(lambda: 0)
    for key in co_occurance_graph:
        self.degree[key] = sum(co_occurance_graph[key].values())

def _build_ranklist(self, phrase_list):
    """Score every contender phrase and store the ranking.

    phrase_score = sum of the scores of the words in the phrase, where the
    word score depends on ``self.metric``:
          DEGREE_TO_FREQUENCY_RATIO -> d(w)/f(w)
          WORD_DEGREE               -> d(w)
          anything else             -> f(w)
    with d the degree and f the frequency of the word.

    Sets ``self.rank_list`` to ``(score, phrase)`` tuples sorted in
    descending order and ``self.ranked_phrases`` to the phrase strings in
    that same order.

    :param phrase_list: Collection of word sequences, each sequence being
                        the words that form one contender phrase.
    """
    self.rank_list = []
    for phrase in phrase_list:
        total = 0.0
        for word in phrase:
            if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
                word_score = 1.0 * self.degree[word] / self.frequency_dist[word]
            elif self.metric == Metric.WORD_DEGREE:
                word_score = 1.0 * self.degree[word]
            else:
                word_score = 1.0 * self.frequency_dist[word]
            total += word_score
        self.rank_list.append((total, " ".join(phrase)))
    # Tuples compare by score first, then by phrase text.
    self.rank_list.sort(reverse=True)
    self.ranked_phrases = [text for _score, text in self.rank_list]

def _generate_phrases(self, sentences):
    """Method to generate contender phrases given the sentences of the text
    document.

    Every sentence is tokenized and tagged with the module-level spaCy
    pipeline. A token whose tag is in ``self.verb_tags_to_rm``, or that is
    a sentence terminator ('.', '!' or '?'), is replaced by a phrase-break
    marker at its own position. Removal is therefore per occurrence: the
    same spelling elsewhere in the sentence under a different tag is kept.

    :param sentences: List of strings where each string represents a
                      sentence which forms the text.
    :return: Set of string tuples where each tuple is a collection
             of words forming a contender phrase.
    """
    # In-band marker for "a phrase ends here". The empty string is assumed
    # never to occur as the text of a real token.
    phrase_break = ""
    break_markers = {phrase_break}
    # One set element per terminator; a single '.!?' string would only
    # match a token spelled exactly like that.
    terminators = {'.', '!', '?'}

    phrase_list = set()
    for sentence in sentences:
        word_list = []
        for token in nlp(sentence):
            tok_str = str(token).lower()
            if tok_str in terminators or token.tag_ in self.verb_tags_to_rm:
                # Break the phrase here instead of deleting the token, so
                # the words on either side are not glued together.
                word_list.append(phrase_break)
            else:
                word_list.append(tok_str)
        phrase_list.update(
            self._get_phrase_list_from_words(word_list, break_markers)
        )
    return phrase_list

def _get_phrase_list_from_words(self, word_list, words_to_rm):
    """Method to create contender phrases from the list of words that form
    a sentence by dropping stopwords and punctuations and grouping the left
    words into phrases. Only phrases in the given length range (both limits
    inclusive) would be considered to build co-occurrence matrix. Ex:
    Sentence: Red apples, are good in flavour.
    List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
    List after dropping punctuations and stopwords.
    List of words: ['red', 'apples', *, *, good, *, 'flavour']
    List of phrases: [('red', 'apples'), ('good',), ('flavour',)]
    List of phrases with a correct length:
    For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)]
    For the range [1, 1]: [('good',), ('flavour',)]
    For the range [2, 2]: [('red', 'apples')]
    :param word_list: List of words which form a sentence when joined in
                      the same order.
    :return: List of contender phrases that are formed after dropping
             stopwords and punctuations.
    """
    # would rather use an index of word instead of words_to_rm, but can't figure it out
    groups = groupby(word_list, lambda x: x not in self.to_ignore and x not in words_to_rm)
    phrases = [tuple(group[1]) for group in groups if group[0]]
    return list(
        filter(
            lambda x: self.min_length <= len(x) <= self.max_length, phrases
        )
    )`

The text was updated successfully, but these errors were encountered:

chriseal · 2018-11-03T19:55:10Z

Also, this is a wrapper that consolidates very similar keywords, based on word-vector similarity. Curious what you think! -Chris

def _get_consolidated_phrases_based_on_similarity(self, ranked_phrases, threshold=0.9):
    """Collapse near-duplicate phrases, keeping the best-ranked of each group.

    Every phrase is parsed with the module-level spaCy pipeline. Walking
    the list best-first, each phrase that is not yet part of a group opens
    a new one and absorbs all later, not-yet-grouped phrases whose
    ``similarity`` to it is at least ``threshold``.

    :param ranked_phrases: List of str, ordered so that the first entry is
                           the best keyword and the last is the worst.
    :param threshold: Minimum similarity for two phrases to be merged
                      (Inclusive. Defaults to 0.9)
    :return: List holding the first member of every group, in ranking
             order. The members are the objects returned by
             ``nlp(phrase)``, not the input strings; use ``str()`` on them
             to get the text back.
    """
    # NOTE(review): the similarity scores depend on the vectors shipped
    # with the loaded model - confirm the model choice is adequate.
    docs = [nlp(phrase) for phrase in ranked_phrases]

    # Track grouped phrases by position so membership never depends on how
    # the parsed objects hash or compare.
    grouped = set()
    groups = []
    for row_idx, row_doc in enumerate(docs):
        if row_idx in grouped:
            continue
        grouped.add(row_idx)
        group = [row_doc]
        # Only phrases ranked below the current one can join its group.
        for col_idx in range(row_idx + 1, len(docs)):
            if col_idx in grouped:
                continue
            col_doc = docs[col_idx]
            if row_doc.similarity(col_doc) >= threshold:
                group.append(col_doc)
                grouped.add(col_idx)
        groups.append(group)

    # `debug` is optional: instances that never set it are treated as
    # having debugging switched off.
    if getattr(self, 'debug', False):
        for group in groups:
            if len(group) > 1:
                print('\tmerged phrases into one group: {}'.format(
                    [str(doc) for doc in group]
                ))

    return [group[0] for group in groups]

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

spacy implementation with certain verb types removed #31

spacy implementation with certain verb types removed #31

chriseal commented Nov 3, 2018

chriseal commented Nov 3, 2018

spacy implementation with certain verb types removed #31

spacy implementation with certain verb types removed #31

Comments

chriseal commented Nov 3, 2018

ADAPTED TO USE SPACY INSTEAD OF NLTK

chriseal commented Nov 3, 2018