Skip to content

Commit

Permalink
Allow numbers with text to pass through
Browse files Browse the repository at this point in the history
  • Loading branch information
bendangelo committed Jan 1, 2024
1 parent 898353e commit aef32a7
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
keyphrase (0.1.3)
keyphrase (0.2.1)

GEM
remote: https://rubygems.org/
Expand Down
4 changes: 2 additions & 2 deletions lib/keyphrase.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ class Keyphrase
autoload :Stoplist, "keyphrase/stoplist"

CLEAN_REGEX = /([^\p{L}a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\p{L}]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
CLEAN_SPACES_REGEX = /\s+/
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\p{L}0-9]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
CLEAN_SPACES_REGEX = /^[0-9\s\.]+$|\s+/ # last phase. Remove extra whitespace and lone numbers
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

def self.analyse text, options={}
Expand Down
2 changes: 1 addition & 1 deletion lib/keyphrase/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

class Keyphrase
VERSION = "0.2.1"
VERSION = "0.2.2"
end
14 changes: 10 additions & 4 deletions spec/keyphrase_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@
it do
result = Keyphrase.analyse "SEMI FINAL! - FURIA vs HAVU - HIGHLIGHTS - Elisa Masters 2023 l CS2"

expect(result.keys).to eq ["SEMI FINAL", "Elisa Masters", "FURIA", "HAVU", "HIGHLIGHTS", "CS2"]
expect(result.keys).to eq ["SEMI FINAL", "Elisa Masters 2023", "FURIA", "HAVU", "HIGHLIGHTS", "CS2"]
end

it do
result = Keyphrase.analyse "FFT AI Battles 2023 Edition! Doggosupreme (Tiny Baby Squad) vs Skadi (Bioshock 2) 1.7"

expect(result.keys).to eq ["Tiny Baby Squad", "Battles 2023 Edition", "FFT", "Doggosupreme", "Skadi", "Bioshock 2"]
end

it "should remove duplicate words" do
Expand Down Expand Up @@ -76,13 +82,13 @@
it "should split on slashes" do
result = Keyphrase.analyse "GTA ON LINE 1.39 NEWSAVE CEO OUTFIT SOLO / SALVARE I COMPLETI CEO DA SOLI"

expect(result.keys).to eq ["NEWSAVE CEO OUTFIT SOLO", "COMPLETI CEO DA SOLI", "GTA", "SALVARE"]
expect(result.keys).to eq ["1.39 NEWSAVE CEO OUTFIT SOLO", "COMPLETI CEO DA SOLI", "GTA", "SALVARE"]
end

it "should remove * ! and words with numbers" do
result = Keyphrase.analyse "*New*Heavy Sniper!!!Fortnite Battle Royale!!!Level 50//5000+Kills//110+Wins!!!"

expect(result.keys).to eq ["Fortnite Battle Royale", "Heavy Sniper", "Level", "Kills", "Wins"]
expect(result.keys).to eq ["Fortnite Battle Royale", "Heavy Sniper", "Level 50", "Kills", "Wins"]
end

it "should split on [] {} <>"do
Expand All @@ -94,7 +100,7 @@
it "should remove numbered dates" do
result = Keyphrase.analyse "Video Beremo 07.04.2018 Nova24TV"

expect(result.keys).to eq ["Video Beremo Nova24TV"]
expect(result.keys).to eq ["Video Beremo 07.04.2018 Nova24TV"]
end

it "should remove stop words from Chinese" do
Expand Down

0 comments on commit aef32a7

Please sign in to comment.