Url matchers

vladimir-tikhonov · Mar 13, 2015 · eff104f · eff104f
1 parent 5b0ed61
commit eff104f
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -91,6 +91,13 @@ r1 = Regexy::Web::IPv6.new             # matches '::1', '2001:DB8::8:800:200C:41
 r1 = Regexy::Web::IPv6.new(:with_port) # matches '[::1]:80' and so on
 any_ipv6 = Regexy::Web::IPv6.new(:normal) | Regexy::Web::IPv6.new(:with_port) # matches ip w\ and w\o port
 ```
+### Regexy::Web::Url
+
+Generates regular expressions for matching URLs (with Unicode support).
+
+```ruby
+r1 = Regexy::Web::Url.new # matches 'http://foo.com', 'www.foo.com' and 'foo.com'
+```
 
 ## Contributing
 Have an idea of new regular expression? Create an [issue](https://github.com/vladimir-tikhonov/regexy/issues) (some test cases will be much appreciated) or open a [pull request](https://github.com/vladimir-tikhonov/regexy/pulls).
diff --git a/lib/regexy/web.rb b/lib/regexy/web.rb
@@ -3,5 +3,6 @@ module Web
     autoload :Email, 'regexy/web/email'
     autoload :IPv4,  'regexy/web/ip'
     autoload :IPv6,  'regexy/web/ip'
+    autoload :Url,   'regexy/web/url'
   end
 end
diff --git a/lib/regexy/web/url.rb b/lib/regexy/web/url.rb
@@ -0,0 +1,20 @@
+# encoding: UTF-8
+
+module Regexy
+  module Web
+    class Url < ::Regexy::Regexp # A ::Regexy::Regexp that is always built from the URL pattern below.
+      URL = /^([a-z][a-z\d+\-.]*:(\/\/([\p{L}\d\-._~%!$&'()*+,;=]+@)?([\p{L}\d\-._~%]+|
+             \[[\p{L}\d:.]+\]|\[v[a-f0-9][\p{L}\d\-._~%!$&'()*+,;=:]+\])(:[0-9]+)?
+             (\/[\p{L}\d\-._~%!$&'()*+,;=:@]+)*\/?|(\/?[\p{L}\d\-._~%!$&'()*+,;=:@]+
+             (\/[\p{L}\d\-._~%!$&'()*+,;=:@]+)*\/?)?)|([\p{L}\d\-._~%!$&'()*+,;=@]+
+             (\/[\p{L}\d\-._~%!$&'()*+,;=:@]+)*\/?|(\/[\p{L}\d\-._~%!$&'()*+,;=:@]+)
+             +\/?))
+             (\?[\p{L}\d\-._~%!$&'()*+,;=:@\/?]*)?(\#[\p{L}\d\-._~%!$&'()*+,;=:@\/?]*)?$
+            /ix.freeze # Flags: i = case-insensitive; x = extended syntax, which is why the literal may be wrapped and indented. \p{L} admits any Unicode letter. NOTE(review): the pattern is anchored with ^ and $, which are line anchors in Ruby; confirm whether \A and \z are wanted for whole-string validation.
+
+      def initialize(*args) # Any caller-supplied arguments are forwarded unchanged to the superclass, after URL.
+        super(URL, *args) # URL is always the first argument; the meaning of the remaining args is defined by ::Regexy::Regexp (not visible here).
+      end
+    end
+  end
+end
diff --git a/spec/web/url_spec.rb b/spec/web/url_spec.rb
@@ -0,0 +1,66 @@
+# encoding: UTF-8
+
+describe Regexy::Web::Url do # Table-driven spec: each sample string below is matched against a Url regexp built with no arguments.
+  VALID_URL = [ # Samples that are expected to match. NOTE(review): a constant assigned inside a describe block is presumably defined at top level and shared across spec files; confirm this is intended.
+    'http://foo.com/blah_blah',
+    'http://foo.com/blah_blah/',
+    'http://foo.com/blah_blah_(wikipedia)',
+    'http://foo.com/blah_blah_(wikipedia)_(again)',
+    'http://www.example.com/wpstyle/?p=364',
+    'https://www.example.com/foo/?bar=baz&inga=42&quux',
+    'http://userid@example.com',
+    'http://userid@example.com/',
+    'http://userid@example.com:8080',
+    'http://userid@example.com:8080/',
+    'http://142.42.1.1/',
+    'http://foo.com/blah_(wikipedia)#cite-1',
+    'http://foo.com/blah_(wikipedia)_blah#cite-1',
+    'http://foo.com/(something)?after=parens',
+    'http://code.google.com/events/#&product=browser',
+    'http://j.mp',
+    'ftp://foo.bar/baz',
+    'http://foo.bar/?q=Test%20URL-encoded%20stuff',
+    'http://مثال.إختبار',
+    'http://例子.测试',
+    'http://1337.net',
+    'http://a.b-c.de',
+    'http://223.255.255.254',
+    'http://киррилический/адрес.рф',
+    'www.foo.bar',
+    'foo.bar',
+    'foo.bar#anchor'
+  ]
+
+  INVALID_URL = [ # Samples that are expected NOT to match: empty authorities, unencoded spaces, bare slashes.
+    'http://',
+    'http://?',
+    'http://??',
+    'http://??/',
+    'http://#',
+    'http://##',
+    'http://##/',
+    'http://foo.bar?q=Spaces should be encoded',
+    '//',
+    '//a',
+    '///a',
+    '///',
+    'http:///a',
+    'http:// shouldfail.com',
+    ':// should fail',
+    'http://foo.bar/foo(bar)baz quux',
+  ]
+
+  let(:r) { Regexy::Web::Url.new } # Regexp under test, constructed without any extra arguments.
+
+  it 'accepts valid url' do
+    VALID_URL.each do |url|
+      expect(url =~ r).to be_truthy # Any non-nil, non-false result of =~ counts as a match.
+    end
+  end
+
+  it 'declines invalid url' do
+    INVALID_URL.each do |url|
+      expect(url =~ r).to be_nil # The =~ result must be exactly nil, i.e. no match.
+    end
+  end
+end