Skip to content

Commit

Permalink
Added strict words, added blacklist, lang, strict eng words, sorted r…
Browse files Browse the repository at this point in the history
…esponses, position bonus, better sentence splitting
  • Loading branch information
bendangelo committed Dec 3, 2023
1 parent ecda708 commit bd432a8
Show file tree
Hide file tree
Showing 4 changed files with 1,406 additions and 33 deletions.
40 changes: 29 additions & 11 deletions lib/keyphrase.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ class Keyphrase
autoload :Stoplist, "keyphrase/stoplist"

# Default cleaning pattern (used when no :clean option is given). Matches any
# character other than an ASCII letter, digit, apostrophe, hyphen, space or
# period, plus any apostrophe or period not directly preceded by a word character.
CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
# Earlier sentence-delimiter pattern; the SENTENCES_REGEX assignment further
# down is evaluated later and replaces this value.
SENTENCES_REGEX = /[!?,;:\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.[^a-zA-Z0-9]/u
# Matches, at the start of a line or after whitespace, a run of characters that
# are not ASCII letters, hyphens or apostrophes and that ends on a word
# boundary (for example a standalone number).
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
# Delimiters on which the input text is split into sentence fragments:
# punctuation, brackets, operators, newlines and tabs; a hyphen not followed by
# a word character; a free-standing apostrophe; a period that is not preceded
# by whitespace and not followed by a letter or digit; and a '#' that starts a
# word.
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

def self.analyse text, options={}
@@keyphrase ||= Keyphrase.new
Expand All @@ -16,17 +17,23 @@ def self.analyse text, options={}

def analyse text, options={}
stoplist = options[:stoplist] || :smart
lang = options[:lang] || :eng
clean_regex = options[:clean] || CLEAN_REGEX
position_bonus = options[:position_bonus] || true
sort = options[:sort] || true

pattern = buildStopwordRegExPattern stoplist
pattern = buildStopwordRegExPattern stoplist, lang
sentences = text.split SENTENCES_REGEX
phrases = generateCandidateKeywords sentences, pattern, clean_regex
wordscores = calculateWordScores phrases
candidates = generateCandidateKeywordScores phrases, wordscores
candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus

if sort
candidates = candidates.sort_by{|k,v| -v}.to_h
end

if options[:verbose]
result = candidates.sort_by{|k,v| v}.reverse
result.each do |word, score|
candidates.each do |word, score|
puts sprintf '%.2f - %s', score, word
end
end
Expand All @@ -38,11 +45,11 @@ def analyse text, options={}

# create stopword pattern
# 1
def buildStopwordRegExPattern stopwords
def buildStopwordRegExPattern stopwords, lang

if stopwords.is_a? Symbol
# use caching
return Keyphrase::Stoplist::Eng.smart_regex
return Keyphrase::Stoplist.stopwords lang, stopwords
end

stop_regex = /(?:^|\s)(?:#{stopwords.join('|')})(?:$|\s)/io
Expand All @@ -55,18 +62,21 @@ def buildStopwordRegExPattern stopwords
# Split sentence fragments into candidate key phrases.
#
# For every sentence, each +clean_regex+ match is replaced by a space and each
# +stopwords_regex+ match by "|"; the pieces between the "|" markers become
# candidates once BLACKLIST_REGEX matches have been blanked out and the
# surrounding whitespace trimmed.
#
# @param sentences [Array<String>] sentence fragments to scan
# @param stopwords_regex [Regexp] pattern whose matches separate phrases
# @param clean_regex [Regexp] pattern whose matches are replaced by a space
# @return [Array<String>] non-empty phrases in first-seen order, with
#   case-insensitive duplicates removed (the first spelling wins)
def generateCandidateKeywords(sentences, stopwords_regex, clean_regex)
  # Single pass per sentence: clean, mark stopwords with "|", cut on the marks.
  fragments = sentences.flat_map do |sentence|
    sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|").split("|")
  end

  phrases = fragments.map { |part| part.strip.gsub(BLACKLIST_REGEX, " ").strip }

  # remove empty pieces and duplicate keywords
  phrases.reject(&:empty?).uniq(&:downcase)
end

Expand Down Expand Up @@ -102,14 +112,22 @@ def calculateWordScores phrases

# generate candidate keyword scores
# 4
def generateCandidateKeywordScores phrases, scores
def generateCandidateKeywordScores phrases, scores, position_bonus
candidates = Hash.new 0
word_index = 0

phrases.each do |phrase|
words = seperateWords(phrase)
score = 0
words.each do |word|
score += scores[word]

# Normalize the score based on the position
if position_bonus
normalized_score = 1.0 / (word_index + 1)
score += normalized_score
word_index += 1
end
end
candidates[phrase] = score
end
Expand All @@ -120,7 +138,7 @@ def generateCandidateKeywordScores phrases, scores
def seperateWords text
words = Array.new

text.split(/[^a-zA-Z0-9_\\+\\-\\']/).each do |word|
text.split(/[^a-zA-Z0-9_\\+\\-\\'\\.]/).each do |word|
word = word.strip.downcase
if !word.empty? && !(true if Float(word) rescue false)
words.push word
Expand Down
12 changes: 9 additions & 3 deletions lib/keyphrase/stoplist.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@ module Keyphrase::Stoplist
require_relative file
end

# Fetch the strict or smart stoplist for a language.
#
# The language name is capitalized and resolved to a constant of this module
# (e.g. :eng -> Eng); that object is then asked for its strict or smart list.
#
# @param lang [Symbol, String] language identifier, e.g. :eng
# @param type [Symbol] :strict selects the strict list; any other value
#   selects the smart list
# @return whatever the language constant's +strict+ / +smart+ method returns
# @raise [NameError] if no constant for +lang+ is defined on this module
def self.stopwords(lang, type = :smart)
  # inherit=false: only constants defined on this module count, so a name such
  # as :string cannot resolve to the top-level ::String class.
  stoplist = const_get(lang.to_s.capitalize, false)
  type == :strict ? stoplist.strict : stoplist.smart
end

# List the objects bound to every constant defined on this module.
#
# @return [Array] one entry per name in +constants+, resolved with +const_get+
def self.stoplist_classes
  # The block must return the looked-up constant; an empty block would yield
  # an array of nils.
  constants.map { |const| const_get(const) }
end

end

0 comments on commit bd432a8

Please sign in to comment.