Skip to content

Commit

Permalink
Added stopwords for many languages
Browse files Browse the repository at this point in the history
  • Loading branch information
bendangelo committed Dec 30, 2023
1 parent b903177 commit 58b9272
Show file tree
Hide file tree
Showing 74 changed files with 21,304 additions and 620 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,19 @@ require 'keyphrase'
keyphrase = Keyphrase.new
```

Use the Smart Stoplist:
Use a custom stopword list:

```
keyphrase.analyse "your text", stoplist: Keyphrase.stopwords[:en]
keyphrase.analyse "your text", stopwords: %w{words to remove here}
# → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural numbers"=>4.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.0, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667}
```

Use a custom stopword list:
Use a stopword list for a language:

```
keyphrase.analyse "your text", ["custom","stopword","list"]
keyphrase.analyse "your text", lang: :kor
```
See `lib/keyphrase/stoplist` for all supported languages.

Shorthand usage:

Expand Down
32 changes: 20 additions & 12 deletions lib/keyphrase.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ class Keyphrase

autoload :Stoplist, "keyphrase/stoplist"

CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
CLEAN_REGEX = /([^\p{L}a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\p{L}]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
CLEAN_SPACES_REGEX = /\s+/
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

Expand All @@ -16,8 +16,12 @@ def self.analyse text, options={}
@@keyphrase.analyse text, options
end

# Prepares the per-instance store that buildStopwordRegExPattern fills with
# one compiled stopword regex per language.
def initialize
  @cached_regex = Hash.new
end

def analyse text, options={}
stoplist = options[:stoplist] || :smart
stopwords = options[:stopwords]
lang = options[:lang] || :eng
clean_regex = options[:clean] || CLEAN_REGEX
position_bonus = options[:position_bonus] || true
Expand All @@ -26,7 +30,7 @@ def analyse text, options={}
sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX

pattern = buildStopwordRegExPattern stoplist, lang
pattern = buildStopwordRegExPattern lang, stopwords
sentences = text.split sentences_regex
phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
wordscores = calculateWordScores phrases
Expand All @@ -49,16 +53,20 @@ def analyse text, options={}

# Step 1: create the stopword pattern.
#
# Builds the regex that matches any stopword standing alone between
# whitespace or line boundaries. The compiled regex is cached per +lang+ and
# reused for as long as the stopword list passed in compares equal to the one
# it was built from.
#
# lang      - language code; used as the cache key and, when +stopwords+ is
#             nil, passed to Keyphrase::Stoplist.stopwords_for_lang.
# stopwords - optional list of words overriding the language's own list.
#
# Returns a case-insensitive Regexp. When the list has no usable words the
# returned pattern never matches.
def buildStopwordRegExPattern lang, stopwords = nil
  stopwords ||= Keyphrase::Stoplist.stopwords_for_lang lang
  @cached_regex ||= {}

  cached = @cached_regex[lang]
  return cached[:regex] if cached && cached[:stopwords] == stopwords

  # Escape every word so characters such as "." or "+" are matched literally
  # instead of being read as regex syntax, and drop blank entries: an empty
  # alternative would make the pattern match bare whitespace.
  alternatives = stopwords.map { |word| Regexp.escape(word.to_s) }.reject(&:empty?)

  regex =
    if alternatives.empty?
      /(?!)/ # no stopwords to filter: a pattern that can never match
    else
      Regexp.new("(?:^|\\s)(?:#{alternatives.join('|')})(?:$|\\s)", Regexp::IGNORECASE | Regexp::MULTILINE)
    end

  # Remember the list alongside the regex so a different list for the same
  # language triggers a rebuild.
  @cached_regex[lang] = { stopwords: stopwords, regex: regex }
  regex
end

# generate candidate keywords
Expand Down
23 changes: 13 additions & 10 deletions lib/keyphrase/stoplist.rb
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
# Namespace for the per-language stopword lists. When this file is loaded it
# requires every Ruby file in the sibling `stoplist` directory and records
# each file's basename as a language symbol.
#
# NOTE(review): this listing appears to interleave the removed and added
# sides of a diff (`stopwords`/`stoplist_classes` next to
# `languages`/`stopwords_for_lang`) — confirm against the repository before
# relying on any single method body below.
module Keyphrase::Stoplist
class << self
# Class variable holding one symbol per stoplist file (filled by the loop below)
@@file_names = []

# Returns the array of language symbols collected from the stoplist filenames
def languages
@@file_names
end
end

# Dynamically require all files in the stoplist directory
Dir[File.join(__dir__, 'stoplist', '*.rb')].each do |file|
require_relative file
# Register the language code: the file's basename without `.rb`, as a symbol
@@file_names << File.basename(file, '.rb').to_sym
end

def self.stopwords lang, type=:smart
def self.stopwords_for_lang lang
# Resolve the constant named after the capitalized language code (e.g. :afr -> Afr)
cl = const_get(lang.to_s.capitalize)

if type == :strict
cl.strict
else
cl.smart
end
end

def self.stoplist_classes
constants.map { |const| }
cl.stopwords
end

end
14 changes: 14 additions & 0 deletions lib/keyphrase/stoplist/afr.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class Keyphrase
  module Stoplist
    # Stopword list kept in the `afr` stoplist file.
    class Afr
      # Returns the array of stopwords; it is built on the first call and the
      # same array is handed back on every later call.
      def self.stopwords
        @@stopwords ||= [
          "'n", "aan", "af", "al", "as", "baie", "by",
          "daar", "dag", "dat", "die", "dit", "een", "ek",
          "en", "gaan", "gesê", "haar", "het", "hom", "hulle",
          "hy", "in", "is", "jou", "jy", "kan",
          "kom", "ma", "maar", "met", "my", "na", "nie",
          "om", "ons", "op", "saam", "sal", "se", "sien",
          "so", "sy", "te", "toe", "uit", "van", "vir",
          "was", "wat", "ʼn"
        ]
      end
    end
  end
end
10 changes: 10 additions & 0 deletions lib/keyphrase/stoplist/aka.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
class Keyphrase
  module Stoplist
    # Stopword list kept in the `aka` stoplist file; it has no entries yet.
    class Aka
      class << self
        # Returns the (currently empty) array of stopwords, reusing the same
        # array on every call.
        def stopwords
          @@stopwords ||= []
        end
      end
    end
  end
end
10 changes: 10 additions & 0 deletions lib/keyphrase/stoplist/amh.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
class Keyphrase
  module Stoplist
    # Stopword list kept in the `amh` stoplist file; it has no entries yet.
    class Amh
      # Returns the (currently empty) array of stopwords. The array is created
      # on first use and the same object is returned afterwards.
      def self.stopwords
        @@stopwords ||= Array.new
      end
    end
  end
end

0 comments on commit 58b9272

Please sign in to comment.