Merge pull request #23 from uclatommy/release-0.3.0

Release 0.3.0
uclatommy · Mar 6, 2017 · 261b308 · 261b308
2 parents 4a90d6e + b3978f5
commit 261b308
Show file tree

Hide file tree

Showing 9 changed files with 476 additions and 81 deletions.
diff --git a/README.md b/README.md
@@ -119,11 +119,11 @@ Timer completed. Disconnecting now...
 [Mon Feb 20 23:47:58 2017] Sentiment Score: -0.11307793897469191
 >>> trump_feels.stop()
 ```
-    
+
 **Note:** Trump is an extremely high volume topic. We ran this for roughly 6 minutes and gathered nearly 15,000 tweets! For lower volume topics, you may want to poll the sentiment value less frequently than every 10 seconds.
 
 #### Stream tweets continuously for another topic and save to a different database.
-    
+
 ```python
 >>> tesla_feels = TweetFeels(login, tracking=['tesla', 'tsla', 'gigafactory', 'elonmusk'], db='tesla.sqlite')
 >>> tesla_feels.calc_every_n = 10
@@ -147,14 +147,15 @@ To correct for these effects, we time-box every second and do not discard the se
 
 ![f1]
 
-Where ![f2] is the aggregate sentiment at time t and ![f3] is the sentiment score for the current time-box. We start the calculation with ![f4], which is why you will see the sentiment score move away from zero until it stabilizes around the natural value. Within each time-box we are using a weighted average of sentiment scores. For each tweet, we utilize the associated user's follower count as the measure of influence.
+Where ![f2] is the aggregate sentiment at time t, ![f3] is the sentiment score for the current time-box, and ![f5] is the fall-off factor between 0 and 1. We start the calculation with ![f4], which is why you will see the sentiment score move away from zero until it stabilizes around the natural value. Within each time-box we are using a weighted average of sentiment scores. For each tweet, we utilize the associated user's follower count as the measure of influence.
 
 Some tweets will also have a neutral score (0.0). In these cases, we exclude it from aggregation.
 
-[f1]: http://chart.apis.google.com/chart?cht=tx&chl=S_{t}=0.99S_{t-1}%2B0.01s_t
+[f1]: http://chart.apis.google.com/chart?cht=tx&chl=S_{t}=%5calpha{S_{t-1}}%2B(1-%5calpha)s_t
 [f2]: http://chart.apis.google.com/chart?cht=tx&chl=S_t
 [f3]: http://chart.apis.google.com/chart?cht=tx&chl=s_t
 [f4]: http://chart.apis.google.com/chart?cht=tx&chl=S_0=0
+[f5]: http://chart.apis.google.com/chart?cht=tx&chl=%5calpha
 
 ## Caveats
 The trained dataset that comes with [vaderSentiment](https://github.com/cjhutto/vaderSentiment) is optimized for social media, so it can recognize the sentiment embedded in neologisms, internet shorthand, and even emoticons. However, it can only measure the aggregate sentiment value of a sentence or group of words. It does not measure whether or not a tweet agrees or disagrees with a particular ideology, political figure, or party, although it is generally true that statements of disagreement tend to have a negative sentiment. As an illustration, have a look at a few sentiment scores from the trump dataset:

diff --git a/setup.py b/setup.py
@@ -1,12 +1,20 @@
 #from distutils.core import setup
 from setuptools import setup
+import os
+
+try:
+    import pypandoc
+    long_description = pypandoc.convert('README.md', 'rst')
+except(IOError, ImportError):
+    long_description = 'Real-time sentiment analysis for twitter.'
 
 filename = 'tweetfeels/version.py'
 exec(compile(open(filename, "rb").read(), filename, 'exec'))
 
 setup(name='tweetfeels',
       version=__version__,
       description='Real-time sentiment analysis for twitter.',
+      long_description=long_description,
       author='Thomas Chen',
       author_email='tkchen@gmail.com',
       url='https://github.com/uclatommy/tweetfeels',

diff --git a/test/test_data.py b/test/test_data.py
@@ -1,16 +1,39 @@
 import unittest
 from tweetfeels import TweetData
 from tweetfeels import Tweet
-from datetime import datetime
+from datetime import datetime, timedelta
 import json
 import os
+import pandas as pd
+import numpy as np
 
 
 class Test_Data(unittest.TestCase):
     def setUp(self):
         self.tweets_data_path = 'test/sample.json'
         self.db = './test.sqlite'
         self.feels_db = TweetData(self.db)
+        self.tweets = [
+            {'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
+             'id_str': '833394296418082817',
+             'text': 'Tweetfeels is tremendous! Believe me. I know.',
+             'user': {'followers_count': '100', 'friends_count': '200',
+                      'location':None}
+            }, # sentiment value = 0
+            {'created_at': 'Sun Feb 20 19:14:19 +0000 2017',
+             'id_str': '833394296418082818',
+             'text': 'Fake news. Sad!',
+             'user': {'followers_count': '100', 'friends_count': '200',
+                      'location':None}
+            }, # sentiment value = -0.7351
+            {'created_at': 'Sun Feb 21 19:14:20 +0000 2017',
+             'id_str': '833394296418082819',
+             'text': 'I hate it.',
+             'user': {'followers_count': '100', 'friends_count': '200',
+                      'location':None}
+            } # sentiment value = -0.5719
+            ]
+        self.mock_tweets = [Tweet(t) for t in self.tweets]
 
     def tearDown(self):
         os.remove(self.db)
@@ -23,10 +46,53 @@ def test_fields(self):
         self.assertTrue(isinstance(f, tuple))
         self.assertTrue(len(f)>=11)
 
-    def test_scrub(self):
-        data = {'a':1, 'b':2}
-        scrubbed = self.feels_db.scrub(data)
-        self.assertTrue(isinstance(scrubbed, str))
+    def test_start(self):
+        self.assertTrue(isinstance(self.feels_db.start, datetime))
+
+    def test_dates(self):
+        for t in self.mock_tweets:
+            self.feels_db.insert_tweet(t)
+        self.assertEqual(len(self.feels_db.tweet_dates), 3)
+
+        tweets = []
+        with open(self.tweets_data_path) as tweets_file:
+            lines = filter(None, (line.rstrip() for line in tweets_file))
+            for line in lines:
+                try:
+                    tweets.append(Tweet(json.loads(line)))
+                except KeyError:
+                    pass
+        for t in tweets:
+            self.feels_db.insert_tweet(t)
+        self.assertEqual(len(self.feels_db.tweet_dates), 105)
+        df = self.feels_db.tweet_dates
+        timebox = timedelta(seconds=60)
+        second = timedelta(seconds=1)
+        df = df.groupby(pd.TimeGrouper(freq=f'{int(timebox/second)}S')).size()
+        df = df[df != 0]
+        print(df)
+        self.assertEqual(len(df), 3)
+        self.assertEqual(df.iloc[0], 103)
+
+    def test_fetch(self):
+        tweets = []
+        with open(self.tweets_data_path) as tweets_file:
+            lines = filter(None, (line.rstrip() for line in tweets_file))
+            for line in lines:
+                try:
+                    tweets.append(Tweet(json.loads(line)))
+                except KeyError:
+                    pass
+        for t in tweets:
+            self.feels_db.insert_tweet(t)
+
+        for t in self.mock_tweets:
+            self.feels_db.insert_tweet(t)
+
+        it = self.feels_db.fetchbin(binsize=timedelta(minutes=30))
+        self.assertEqual(len(next(it)), 103)
+        self.assertEqual(len(next(it)), 1)
+        self.assertEqual(len(next(it)), 1)
 
     def test_data_operation(self):
         twt = {'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
@@ -35,14 +101,21 @@ def test_data_operation(self):
         t = Tweet(twt)
         self.assertEqual(len(t.keys()), 7)
         self.feels_db.insert_tweet(t)
-        dfs = self.feels_db.tweets_since(datetime.now())
-        for df in dfs:
-            self.assertEqual(len(df), 0)
-        dfs = self.feels_db.tweets_since(0)
-        for df in dfs:
-            self.assertEqual(len(df), 1)
-            df.sentiment = 0.9
-            for row in df.itertuples():
-                self.feels_db.update_tweet(
-                    {'id_str': row.id_str, 'sentiment': row.sentiment}
-                    )
+        df = self.feels_db.tweets_since(datetime.now())
+        self.assertEqual(len(df), 0)
+        df = self.feels_db.tweets_since(0)
+        self.assertEqual(len(df), 1)
+        df.sentiment = 0.9
+        for row in df.itertuples():
+            self.feels_db.update_tweet(
+                {'id_str': row.id_str, 'sentiment': row.sentiment}
+                )
+
+        start = datetime(2017, 2, 17, 0, 0, 0)
+        before = datetime(2017, 2, 18, 0, 0, 0)
+        after = datetime(2017, 2, 20, 0, 0, 0)
+        df = self.feels_db.tweets_between(start, before)
+        self.assertEqual(len(df), 0)
+
+        df = self.feels_db.tweets_between(start, after)
+        self.assertEqual(len(df), 1)
diff --git a/test/test_feels.py b/test/test_feels.py
@@ -3,6 +3,8 @@
 import json
 import os
 import time
+import numpy as np
+from datetime import datetime, timedelta
 
 from tweetfeels import (TweetFeels, Tweet, TweetData)
 
@@ -14,14 +16,41 @@ def setUp(self):
         TweetFeels._listener_factory = (lambda ctrl: MagicMock())
         TweetFeels._stream_factory = (lambda auth, listener: MagicMock())
         self.tweets_data_path = 'test/sample.json'
+        self.tweets = [
+            {'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
+             'id_str': '833394296418082817',
+             'text': 'Tweetfeels is tremendous! Believe me. I know.',
+             'user': {'followers_count': '100', 'friends_count': '200',
+                      'location':None}
+            }, # sentiment value = 0
+            {'created_at': 'Sun Feb 20 19:14:19 +0000 2017',
+             'id_str': '833394296418082818',
+             'text': 'Fake news. Sad!',
+             'user': {'followers_count': '100', 'friends_count': '200',
+                      'location':None}
+            }, # sentiment value = -0.7351
+            {'created_at': 'Sun Feb 21 19:14:20 +0000 2017',
+             'id_str': '833394296418082819',
+             'text': 'I hate it.',
+             'user': {'followers_count': '100', 'friends_count': '200',
+                      'location':None}
+            } # sentiment value = -0.5719
+            ]
+        self.mock_feels = TweetFeels('abcd')
+        self.feels_db = TweetData(file='./test/db.sqlite')
+        self.mock_feels._feels = self.feels_db
+        self.mock_tweets = [Tweet(t) for t in self.tweets]
+
+    def tearDown(self):
+        os.remove('./test/db.sqlite')
 
     def test_start(self):
         mock_feels = TweetFeels("abcd")
         mock_feels.tracking = []
-        mock_feels.start()
+        mock_feels.start(selfupdate=0)
         mock_feels._stream.filter.assert_not_called()
         mock_feels.tracking = ['tsla']
-        mock_feels.start()
+        mock_feels.start(selfupdate=0)
         mock_feels._stream.filter.assert_called_once()
 
     def test_stop(self):
@@ -55,6 +84,9 @@ def test_sentiment(self):
         mock_feels = TweetFeels("abcd")
         mock_feels._feels.tweets_since = MagicMock(return_value=[])
         mock_feels._sentiment = 0.5
+        mock_feels._latest_calc = datetime(2017, 1, 1, 0, 0, 0)
+        mock_feels._feels.start = datetime(2017, 1, 1, 0, 0, 0)
+        mock_feels._feels.end = datetime(2017, 1, 1, 0, 0, 0)
         self.assertEqual(mock_feels.sentiment, 0.5)
 
     def test_buffer(self):
@@ -76,3 +108,45 @@ def test_buffer(self):
             dfs = [df for df in mock_feels._feels.all]
             self.assertEqual(len(dfs[0]), 6)
         os.remove('sample.sqlite')
+
+    def test_sentiment_comprehensive(self):
+        sentiment = 0.0
+        for t in self.mock_tweets:
+            self.feels_db.insert_tweet(t)
+            if t['sentiment']!=0:
+                # print(f'0.99*{sentiment} + 0.01*{t["sentiment"]}')
+                sentiment = 0.99*sentiment + 0.01*t['sentiment']
+                # print(f'sentiment = {sentiment}')
+        self.mock_feels.clear_buffer()
+        # calc = 0*0.99**2 + 0.01*0.99*-0.7351 + 0.01*-0.5719
+        #      = -0.01299649
+        self.mock_feels._latest_calc = self.mock_feels._feels.start
+        self.assertTrue(np.isclose(self.mock_feels.sentiment, sentiment))
+        # first observation is at 2017-2-19 19:14:18 and we are using default
+        # 60 second bins, therefore the observation at 2017-2-21 19:14:20 will
+        # never get saved but will always be recalculated.
+        self.assertEqual(self.mock_feels._latest_calc,
+                         datetime(2017, 2, 21, 19, 14, 18))
+
+        # repeat the calculation, nothing changes
+        self.assertTrue(np.isclose(self.mock_feels.sentiment, sentiment))
+        self.assertEqual(self.mock_feels._latest_calc,
+                         datetime(2017, 2, 21, 19, 14, 18))
+
+    def test_sentiments(self):
+        for t in self.mock_tweets:
+            self.feels_db.insert_tweet(t)
+        self.mock_feels.clear_buffer()
+        self.mock_feels.calc_every_n = 1
+        start = datetime(2017, 2, 19, 0, 0, 0)
+        dt = timedelta(days=1)
+        sentiment = self.mock_feels.sentiments(strt=start, delta_time=dt)
+        self.assertTrue(np.isclose(next(sentiment), 0))
+        self.assertTrue(np.isclose(next(sentiment), -0.007351))
+        self.assertTrue(np.isclose(next(sentiment), -0.01299649))
+        for s in sentiment:
+            print(s)
+        # we are starting at 2017-2-19 00:00:00 and using bins with length 1 day
+        # therefore our latest calc will be just prior to the final observation.
+        self.assertEqual(self.mock_feels._latest_calc,
+                         datetime(2017, 2, 21, 0, 0, 0))
diff --git a/test/test_listener.py b/test/test_listener.py
@@ -41,29 +41,29 @@ def test_on_disconnect(self, mock_feels):
     @patch('tweetfeels.TweetFeels')
     def test_on_connect(self, mock_feels):
         tl = TweetListener(mock_feels)
-        tl.waited = 60
+        tl._waited = 60
         tl.on_connect()
-        self.assertEqual(tl.waited, 0)
+        self.assertEqual(tl._waited, 0)
 
     @patch('tweetfeels.TweetFeels')
     def test_on_error(self, mock_feels):
         tl = TweetListener(mock_feels)
         tl.reconnect_wait = MagicMock()
         tl.on_error(420)
         tl.reconnect_wait.assert_called_with('exponential')
-        self.assertEqual(tl.waited, 60)
+        self.assertEqual(tl._waited, 60)
         mock_feels.on_error.assert_called_with(420)
 
     @patch('tweetfeels.TweetFeels')
     def test_reconnect_wait(self, mock_feels):
         tl = TweetListener(mock_feels)
-        tl.waited = 0.1
+        tl._waited = 0.1
         tl.reconnect_wait('linear')
-        self.assertEqual(tl.waited, 1.1)
-        tl.waited = 0.1
+        self.assertEqual(tl._waited, 1.1)
+        tl._waited = 0.1
         tl.reconnect_wait('exponential')
         tl.reconnect_wait('exponential')
-        self.assertEqual(tl.waited, 0.4)
+        self.assertEqual(tl._waited, 0.4)
 
 
 class Test_Tweet(unittest.TestCase):