You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Thanks for sharing! Here's the rake.py file edited to use spaCy instead of NLTK. It also drops certain verb forms (tagged in _generate_phrases and removed in _get_phrase_list_from_words), which I found improves results a bit (on a small sample).
# -*- coding: utf-8 -*-
"""Implementation of Rapid Automatic Keyword Extraction algorithm.

As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
"""
# ADAPTED TO USE SPACY INSTEAD OF NLTK
import string
from collections import Counter, defaultdict
from itertools import chain, groupby, product
from enum import Enum
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')
class Metric(Enum):
    """Different metrics that can be used for ranking.

    ``d(w)`` is the degree of a word and ``f(w)`` is its frequency.
    """

    DEGREE_TO_FREQUENCY_RATIO = 0  # Uses d(w)/f(w) as the metric
    WORD_DEGREE = 1  # Uses d(w) alone as the metric
    WORD_FREQUENCY = 2  # Uses f(w) alone as the metric
class Rake(object):
    """Rapid Automatic Keyword Extraction Algorithm.

    Typical use: construct an instance, call ``extract_keywords_from_text``
    (or ``extract_keywords_from_sentences``), then read the results through
    the ``get_*`` methods. Until an extraction has run, those getters
    return ``None``.
    """

    # Sentence-terminating tokens. They always act as phrase breaks and are
    # never part of a phrase, whatever ``punctuations`` is configured.
    _SENTENCE_TERMINATORS = frozenset(".!?")

    def __init__(
        self,
        stopwords=None,
        punctuations=None,
        language="english",
        ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO,
        max_length=100000,
        min_length=1,
        verb_tags_to_rm=None
    ):
        """Constructor.

        :param stopwords: List of Words to be ignored for keyword extraction.
            Defaults to spaCy's English ``STOP_WORDS``.
        :param punctuations: Punctuations to be ignored for keyword
            extraction. Defaults to ``string.punctuation``.
        :param language: Kept for interface compatibility; this spaCy-based
            version does not read it.
        :param ranking_metric: ``Metric`` member used to score words. Any
            value that is not a ``Metric`` falls back to
            ``Metric.DEGREE_TO_FREQUENCY_RATIO``.
        :param max_length: Maximum limit on the number of words in a phrase
            (Inclusive. Defaults to 100000)
        :param min_length: Minimum limit on the number of words in a phrase
            (Inclusive. Defaults to 1)
        :param verb_tags_to_rm: Container of fine-grained POS tags; tokens
            carrying one of these tags are treated as phrase breaks.
            Defaults to ``{'VB', 'VBD', 'VBP', 'VBZ'}``.
        """
        # By default use degree to frequency ratio as the metric.
        if isinstance(ranking_metric, Metric):
            self.metric = ranking_metric
        else:
            self.metric = Metric.DEGREE_TO_FREQUENCY_RATIO

        # If stopwords not provided we use spaCy's stopwords by default.
        self.stopwords = stopwords
        if self.stopwords is None:
            self.stopwords = list(STOP_WORDS)

        # If punctuations are not provided we ignore all punctuation symbols.
        self.punctuations = punctuations
        if self.punctuations is None:
            self.punctuations = string.punctuation

        # Verb tags and what the default does with them:
        #   RM:   VB   verb, base form
        #   RM:   VBD  verb, past tense
        #   KEEP: VBG  verb, gerund or present participle
        #   KEEP: VBN  verb, past participle
        #   RM:   VBP  verb, non-3rd person singular present
        #   RM:   VBZ  verb, 3rd person singular present
        self.verb_tags_to_rm = verb_tags_to_rm
        if self.verb_tags_to_rm is None:
            self.verb_tags_to_rm = {'VB', 'VBD', 'VBP', 'VBZ'}

        # All things which act as sentence breaks during keyword extraction.
        # Note: chaining over a string yields its individual characters, so
        # the default punctuation string contributes one entry per symbol.
        self.to_ignore = set(chain(self.stopwords, self.punctuations))

        # Assign min or max length to the attributes
        self.min_length = min_length
        self.max_length = max_length

        # Stuff to be extracted from the provided text.
        self.frequency_dist = None
        self.degree = None
        self.rank_list = None
        self.ranked_phrases = None

    def extract_keywords_from_text(self, text):
        """Method to extract keywords from the text provided.

        The text is lower-cased, split into sentences with spaCy, and the
        sentences are handed to ``extract_keywords_from_sentences``.

        :param text: Text to extract keywords from, provided as a string.
        """
        sentences = []
        for sent in nlp(text.lower()).sents:
            sent_text = str(sent)
            # Skip "sentences" that are nothing but a terminator.
            if sent_text not in self._SENTENCE_TERMINATORS:
                sentences.append(sent_text)
        self.extract_keywords_from_sentences(sentences)

    def extract_keywords_from_sentences(self, sentences):
        """Method to extract keywords from the list of sentences provided.

        Populates the frequency distribution, the word degrees and the
        ranked phrase lists on this instance.

        :param sentences: Text to extract keywords from, provided as a list
            of strings, where each string is a sentence.
        """
        phrase_list = self._generate_phrases(sentences)
        self._build_frequency_dist(phrase_list)
        self._build_word_co_occurance_graph(phrase_list)
        self._build_ranklist(phrase_list)

    def get_ranked_phrases(self):
        """Method to fetch ranked keyword strings.

        :return: List of strings where each string represents an extracted
            keyword string, best-scoring first.
        """
        return self.ranked_phrases

    def get_ranked_phrases_with_scores(self):
        """Method to fetch ranked keyword strings along with their scores.

        :return: List of tuples where each tuple is formed of a score and
            the extracted keyword string. Ex: (4.0, 'red apples')
        """
        return self.rank_list

    def get_word_frequency_distribution(self):
        """Method to fetch the word frequency distribution in the given text.

        :return: ``collections.Counter`` of the format `word -> frequency`.
        """
        return self.frequency_dist

    def get_word_degrees(self):
        """Method to fetch the degree of words in the given text. Degree can be
        defined as sum of co-occurances of the word with other words in the
        given text.

        :return: Dictionary (defaultdict) of the format `word -> degree`.
        """
        return self.degree

    def _build_frequency_dist(self, phrase_list):
        """Builds frequency distribution of the words in the given body of text.

        :param phrase_list: Collection of string tuples where each tuple is
            a collection of words which form a contender phrase.
        """
        self.frequency_dist = Counter(chain.from_iterable(phrase_list))

    def _build_word_co_occurance_graph(self, phrase_list):
        """Builds the co-occurance graph of words in the given body of text to
        compute degree of each word.

        :param phrase_list: Collection of string tuples where each tuple is
            a collection of words which form a contender phrase.
        """
        co_occurance_graph = defaultdict(lambda: defaultdict(int))
        for phrase in phrase_list:
            # For each phrase in the phrase list, count co-occurances of the
            # word with other words in the phrase (a word also pairs with
            # itself, since the product is taken of the phrase with itself).
            #
            # Note: Keep the co-occurances graph as is, to help facilitate its
            # use in other creative ways if required later.
            for (word, coword) in product(phrase, phrase):
                co_occurance_graph[word][coword] += 1

        self.degree = defaultdict(int)
        for word, cowords in co_occurance_graph.items():
            self.degree[word] = sum(cowords.values())

    def _build_ranklist(self, phrase_list):
        """Method to rank each contender phrase using the formula

              phrase_score = sum of scores of words in the phrase.

        The word score depends on the configured metric: d(w)/f(w), d(w)
        alone, or f(w) alone, where d is degree and f is frequency.

        :param phrase_list: Collection of string tuples where each tuple is
            a collection of words which form a contender phrase.
        """
        self.rank_list = []
        for phrase in phrase_list:
            rank = 0.0
            for word in phrase:
                if self.metric == Metric.DEGREE_TO_FREQUENCY_RATIO:
                    rank += 1.0 * self.degree[word] / self.frequency_dist[word]
                elif self.metric == Metric.WORD_DEGREE:
                    rank += 1.0 * self.degree[word]
                else:
                    rank += 1.0 * self.frequency_dist[word]
            self.rank_list.append((rank, " ".join(phrase)))
        # Highest score first; equal scores are ordered by the phrase text.
        self.rank_list.sort(reverse=True)
        self.ranked_phrases = [ph[1] for ph in self.rank_list]

    def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.

        Each sentence is tokenised with spaCy. Tokens whose tag is in
        ``self.verb_tags_to_rm``, and sentence terminators, are recorded so
        that they act as phrase breaks.

        :param sentences: List of strings where each string represents a
            sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
            of words forming a contender phrase.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list, words_to_rm = [], set()
            for token in nlp(sentence):
                tok_str = str(token).lower()
                word_list.append(tok_str)
                # Removal is keyed on the token text, so every occurrence of
                # that text in this sentence is treated as a break.
                if (tok_str in self._SENTENCE_TERMINATORS
                        or token.tag_ in self.verb_tags_to_rm):
                    words_to_rm.add(tok_str)
            phrase_list.update(
                self._get_phrase_list_from_words(word_list, words_to_rm)
            )
        return phrase_list

    def _get_phrase_list_from_words(self, word_list, words_to_rm=()):
        """Method to create contender phrases from the list of words that form
        a sentence by dropping stopwords and punctuations and grouping the left
        words into phrases. Only phrases in the given length range (both limits
        inclusive) would be considered to build co-occurrence matrix. Ex:

        Sentence: Red apples, are good in flavour.
        List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
        List after dropping punctuations and stopwords
        (assuming 'are' and 'in' are stopwords).
        List of words: ['red', 'apples', *, *, good, *, 'flavour']
        List of phrases: [('red', 'apples'), ('good',), ('flavour',)]

        List of phrases with a correct length:
        For the range [1, 2]: [('red', 'apples'), ('good',), ('flavour',)]
        For the range [1, 1]: [('good',), ('flavour',)]
        For the range [2, 2]: [('red', 'apples')]

        :param word_list: List of words which form a sentence when joined in
            the same order.
        :param words_to_rm: Extra words (beyond ``self.to_ignore``) that
            break phrases. Defaults to none.
        :return: List of contender phrases that are formed after dropping
            stopwords and punctuations.
        """
        def is_phrase_word(word):
            return word not in self.to_ignore and word not in words_to_rm

        # groupby splits the sentence into alternating runs of kept and
        # ignored words; only the kept runs become phrases.
        phrases = [
            tuple(run)
            for keep, run in groupby(word_list, is_phrase_word)
            if keep
        ]
        return [
            phrase for phrase in phrases
            if self.min_length <= len(phrase) <= self.max_length
        ]
The text was updated successfully, but these errors were encountered:
Also, here is a wrapper that consolidates very similar keywords based on word-vector similarity. Curious what you think! -Chris
def _get_consolidated_phrases_based_on_similarity(self, ranked_phrases, threshold=0.9):
    """Collapse near-duplicate phrases, keeping the best-ranked of each group.

    Every phrase is parsed with ``nlp``. Walking the list from best to
    worst, each phrase not yet grouped starts a new group and absorbs every
    later, still-ungrouped phrase whose ``similarity`` to it is at least
    ``threshold``.

    :param ranked_phrases: list of str, ordered so the first is the best
        keyword and the last is the worst keyword.
    :param threshold: Minimum similarity for two phrases to be merged
        (Inclusive. Defaults to 0.9)
    :return: List with the first (best-ranked) member of each group, in
        ranking order. The items are the objects returned by ``nlp(...)``,
        not the input strings; use ``str(item)`` to get the text back.
    """
    phrase_docs = [nlp(p) for p in ranked_phrases]
    grouped_indices = set()  # positions already assigned to some group
    grouped_phrases = []

    for row_idx, row_doc in enumerate(phrase_docs):
        if row_idx in grouped_indices:
            continue
        grouped_indices.add(row_idx)
        current_group = [row_doc]
        # Only look at lower-ranked phrases; higher-ranked ones have already
        # had their chance to absorb this one.
        for col_idx, col_doc in enumerate(
            phrase_docs[row_idx + 1:], start=row_idx + 1
        ):
            if col_idx in grouped_indices:
                continue
            if row_doc.similarity(col_doc) >= threshold:
                current_group.append(col_doc)
                grouped_indices.add(col_idx)
        grouped_phrases.append(current_group)

    # ``debug`` is optional on the instance; treat a missing flag as off.
    if getattr(self, 'debug', False):
        for group in grouped_phrases:
            if len(group) > 1:
                print('\tmerged phrases into one group: {}'.format(group))

    return [group[0] for group in grouped_phrases]
Thanks for sharing! Here's the rake.py file edited to use spaCy instead of NLTK. It also drops certain verb forms (tagged in _generate_phrases and removed in _get_phrase_list_from_words), which I found improves results a bit (on a small sample).
# -*- coding: utf-8 -*-
"""Implementation of Rapid Automatic Keyword Extraction algorithm.

As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
"""
# ADAPTED TO USE SPACY INSTEAD OF NLTK
import string
from collections import Counter, defaultdict
from itertools import chain, groupby, product
from enum import Enum
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')
class Metric(Enum):
"""Different metrics that can be used for ranking."""
class Rake(object):
"""Rapid Automatic Keyword Extraction Algorithm."""
The text was updated successfully, but these errors were encountered: