Skip to content

Commit

Permalink
Merge pull request #23 from uclatommy/release-0.3.0
Browse files Browse the repository at this point in the history
Release 0.3.0
  • Loading branch information
Thomas Chen, ASA committed Mar 6, 2017
2 parents 4a90d6e + b3978f5 commit 261b308
Show file tree
Hide file tree
Showing 9 changed files with 476 additions and 81 deletions.
9 changes: 5 additions & 4 deletions README.md
Expand Up @@ -119,11 +119,11 @@ Timer completed. Disconnecting now...
[Mon Feb 20 23:47:58 2017] Sentiment Score: -0.11307793897469191
>>> trump_feels.stop()
```

**Note:** Trump is an extremely high volume topic. We ran this for roughly 6 minutes and gathered nearly 15,000 tweets! For lower volume topics, you may want to poll the sentiment value less frequently than every 10 seconds.

#### Stream tweets continuously for another topic and save to a different database.

```python
>>> tesla_feels = TweetFeels(login, tracking=['tesla', 'tsla', 'gigafactory', 'elonmusk'], db='tesla.sqlite')
>>> tesla_feels.calc_every_n = 10
Expand All @@ -147,14 +147,15 @@ To correct for these effects, we time-box every second and do not discard the se

![f1]

Where ![f2] is the aggregate sentiment at time t and ![f3] is the sentiment score for the current time-box. We start the calculation with ![f4], which is why you will see the sentiment score move away from zero until it stabilizes around the natural value. Within each time-box we are using a weighted average of sentiment scores. For each tweet, we utilize the associated user's follower count as the measure of influence.
Where ![f2] is the aggregate sentiment at time t, ![f3] is the sentiment score for the current time-box, and ![f5] is the fall-off factor between 0 and 1. We start the calculation with ![f4], which is why you will see the sentiment score move away from zero until it stabilizes around the natural value. Within each time-box we are using a weighted average of sentiment scores. For each tweet, we utilize the associated user's follower count as the measure of influence.

Some tweets will also have a neutral score (0.0). In these cases, we exclude them from aggregation.

[f1]: http://chart.apis.google.com/chart?cht=tx&chl=S_{t}=0.99S_{t-1}%2B0.01s_t
[f1]: http://chart.apis.google.com/chart?cht=tx&chl=S_{t}=%5calpha{S_{t-1}}%2B(1-%5calpha)s_t
[f2]: http://chart.apis.google.com/chart?cht=tx&chl=S_t
[f3]: http://chart.apis.google.com/chart?cht=tx&chl=s_t
[f4]: http://chart.apis.google.com/chart?cht=tx&chl=S_0=0
[f5]: http://chart.apis.google.com/chart?cht=tx&chl=%5calpha

## Caveats
The trained dataset that comes with [vaderSentiment](https://github.com/cjhutto/vaderSentiment) is optimized for social media, so it can recognize the sentiment embedded in neologisms, internet shorthand, and even emoticons. However, it can only measure the aggregate sentiment value of a sentence or group of words. It does not measure whether or not a tweet agrees or disagrees with a particular ideology, political figure, or party, although statements of disagreement generally tend to have a negative sentiment. As an illustration, have a look at a few sentiment scores from the trump dataset:
Expand Down
8 changes: 8 additions & 0 deletions setup.py
@@ -1,12 +1,20 @@
#from distutils.core import setup
from setuptools import setup
import os

# Convert the README to reStructuredText for the package's long description
# when pypandoc can be imported and the conversion succeeds; otherwise fall
# back to a one-line summary.
try:
    import pypandoc
    long_description = pypandoc.convert('README.md', 'rst')
except (IOError, ImportError):
    long_description = 'Real-time sentiment analysis for twitter.'

# Execute the version module in this namespace instead of importing the
# package; it is expected to define the __version__ passed to setup() below.
filename = 'tweetfeels/version.py'
with open(filename, 'rb') as version_file:
    # The context manager closes the file handle as soon as it has been read.
    exec(compile(version_file.read(), filename, 'exec'))

setup(name='tweetfeels',
version=__version__,
description='Real-time sentiment analysis for twitter.',
long_description=long_description,
author='Thomas Chen',
author_email='tkchen@gmail.com',
url='https://github.com/uclatommy/tweetfeels',
Expand Down
105 changes: 89 additions & 16 deletions test/test_data.py
@@ -1,16 +1,39 @@
import unittest
from tweetfeels import TweetData
from tweetfeels import Tweet
from datetime import datetime
from datetime import datetime, timedelta
import json
import os
import pandas as pd
import numpy as np


class Test_Data(unittest.TestCase):
def setUp(self):
    """Open a scratch database and prepare three hand-written tweets."""
    def raw_tweet(created_at, id_str, text):
        # Each fixture gets its own freshly built ``user`` dict so that no
        # two raw tweets share a mutable object.
        return {
            'created_at': created_at,
            'id_str': id_str,
            'text': text,
            'user': {
                'followers_count': '100',
                'friends_count': '200',
                'location': None,
            },
        }

    self.tweets_data_path = 'test/sample.json'
    self.db = './test.sqlite'
    self.feels_db = TweetData(self.db)
    self.tweets = [
        # sentiment value = 0
        raw_tweet('Sun Feb 19 19:14:18 +0000 2017',
                  '833394296418082817',
                  'Tweetfeels is tremendous! Believe me. I know.'),
        # sentiment value = -0.7351
        raw_tweet('Sun Feb 20 19:14:19 +0000 2017',
                  '833394296418082818',
                  'Fake news. Sad!'),
        # sentiment value = -0.5719
        raw_tweet('Sun Feb 21 19:14:20 +0000 2017',
                  '833394296418082819',
                  'I hate it.'),
    ]
    self.mock_tweets = list(map(Tweet, self.tweets))

def tearDown(self):
os.remove(self.db)
Expand All @@ -23,10 +46,53 @@ def test_fields(self):
self.assertTrue(isinstance(f, tuple))
self.assertTrue(len(f)>=11)

def test_scrub(self):
data = {'a':1, 'b':2}
scrubbed = self.feels_db.scrub(data)
self.assertTrue(isinstance(scrubbed, str))
def test_start(self):
    """The ``start`` attribute of the database is a ``datetime``."""
    start = self.feels_db.start
    self.assertTrue(isinstance(start, datetime))

def test_dates(self):
    """Check ``tweet_dates`` row counts and their grouping into 60-second bins.

    Inserts the three hand-written tweets, then every record of the sample
    file that can be turned into a ``Tweet``, asserting the running totals
    (3, then 105). The dates are then bucketed into 60-second time-boxes and
    the non-empty buckets are checked.
    """
    for t in self.mock_tweets:
        self.feels_db.insert_tweet(t)
    self.assertEqual(len(self.feels_db.tweet_dates), 3)

    tweets = []
    with open(self.tweets_data_path) as tweets_file:
        lines = filter(None, (line.rstrip() for line in tweets_file))
        for line in lines:
            try:
                tweets.append(Tweet(json.loads(line)))
            except KeyError:
                # Deliberate best-effort: a record that raises KeyError is
                # skipped (presumably a non-tweet entry in the sample file).
                continue
    for t in tweets:
        self.feels_db.insert_tweet(t)
    self.assertEqual(len(self.feels_db.tweet_dates), 105)

    timebox = timedelta(seconds=60)
    # pd.TimeGrouper is deprecated and no longer exists in current pandas;
    # pd.Grouper(freq=...) is its replacement. Passing the timedelta itself
    # avoids depending on frequency alias strings.
    counts = self.feels_db.tweet_dates.groupby(pd.Grouper(freq=timebox)).size()
    counts = counts[counts != 0]
    self.assertEqual(len(counts), 3)
    self.assertEqual(counts.iloc[0], 103)

def test_fetch(self):
    """Check the sizes of the first three 30-minute bins from ``fetchbin``.

    The sample-file tweets are inserted first, followed by the three
    hand-written tweets from ``setUp``.
    """
    sample = []
    with open(self.tweets_data_path) as handle:
        for raw in handle:
            record = raw.rstrip()
            if not record:
                # Blank lines carry no tweet.
                continue
            try:
                sample.append(Tweet(json.loads(record)))
            except KeyError:
                # Records that raise KeyError are skipped.
                continue

    for tweet in sample + self.mock_tweets:
        self.feels_db.insert_tweet(tweet)

    bins = self.feels_db.fetchbin(binsize=timedelta(minutes=30))
    for expected_size in (103, 1, 1):
        self.assertEqual(len(next(bins)), expected_size)

def test_data_operation(self):
twt = {'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
Expand All @@ -35,14 +101,21 @@ def test_data_operation(self):
t = Tweet(twt)
self.assertEqual(len(t.keys()), 7)
self.feels_db.insert_tweet(t)
dfs = self.feels_db.tweets_since(datetime.now())
for df in dfs:
self.assertEqual(len(df), 0)
dfs = self.feels_db.tweets_since(0)
for df in dfs:
self.assertEqual(len(df), 1)
df.sentiment = 0.9
for row in df.itertuples():
self.feels_db.update_tweet(
{'id_str': row.id_str, 'sentiment': row.sentiment}
)
df = self.feels_db.tweets_since(datetime.now())
self.assertEqual(len(df), 0)
df = self.feels_db.tweets_since(0)
self.assertEqual(len(df), 1)
df.sentiment = 0.9
for row in df.itertuples():
self.feels_db.update_tweet(
{'id_str': row.id_str, 'sentiment': row.sentiment}
)

start = datetime(2017, 2, 17, 0, 0, 0)
before = datetime(2017, 2, 18, 0, 0, 0)
after = datetime(2017, 2, 20, 0, 0, 0)
df = self.feels_db.tweets_between(start, before)
self.assertEqual(len(df), 0)

df = self.feels_db.tweets_between(start, after)
self.assertEqual(len(df), 1)
78 changes: 76 additions & 2 deletions test/test_feels.py
Expand Up @@ -3,6 +3,8 @@
import json
import os
import time
import numpy as np
from datetime import datetime, timedelta

from tweetfeels import (TweetFeels, Tweet, TweetData)

Expand All @@ -14,14 +16,41 @@ def setUp(self):
TweetFeels._listener_factory = (lambda ctrl: MagicMock())
TweetFeels._stream_factory = (lambda auth, listener: MagicMock())
self.tweets_data_path = 'test/sample.json'
self.tweets = [
{'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
'id_str': '833394296418082817',
'text': 'Tweetfeels is tremendous! Believe me. I know.',
'user': {'followers_count': '100', 'friends_count': '200',
'location':None}
}, # sentiment value = 0
{'created_at': 'Sun Feb 20 19:14:19 +0000 2017',
'id_str': '833394296418082818',
'text': 'Fake news. Sad!',
'user': {'followers_count': '100', 'friends_count': '200',
'location':None}
}, # sentiment value = -0.7351
{'created_at': 'Sun Feb 21 19:14:20 +0000 2017',
'id_str': '833394296418082819',
'text': 'I hate it.',
'user': {'followers_count': '100', 'friends_count': '200',
'location':None}
} # sentiment value = -0.5719
]
self.mock_feels = TweetFeels('abcd')
self.feels_db = TweetData(file='./test/db.sqlite')
self.mock_feels._feels = self.feels_db
self.mock_tweets = [Tweet(t) for t in self.tweets]

def tearDown(self):
    """Delete the SQLite file that ``setUp`` opens for each test."""
    db_file = './test/db.sqlite'
    os.remove(db_file)

def test_start(self):
mock_feels = TweetFeels("abcd")
mock_feels.tracking = []
mock_feels.start()
mock_feels.start(selfupdate=0)
mock_feels._stream.filter.assert_not_called()
mock_feels.tracking = ['tsla']
mock_feels.start()
mock_feels.start(selfupdate=0)
mock_feels._stream.filter.assert_called_once()

def test_stop(self):
Expand Down Expand Up @@ -55,6 +84,9 @@ def test_sentiment(self):
mock_feels = TweetFeels("abcd")
mock_feels._feels.tweets_since = MagicMock(return_value=[])
mock_feels._sentiment = 0.5
mock_feels._latest_calc = datetime(2017, 1, 1, 0, 0, 0)
mock_feels._feels.start = datetime(2017, 1, 1, 0, 0, 0)
mock_feels._feels.end = datetime(2017, 1, 1, 0, 0, 0)
self.assertEqual(mock_feels.sentiment, 0.5)

def test_buffer(self):
Expand All @@ -76,3 +108,45 @@ def test_buffer(self):
dfs = [df for df in mock_feels._feels.all]
self.assertEqual(len(dfs[0]), 6)
os.remove('sample.sqlite')

def test_sentiment_comprehensive(self):
# Cross-checks the `sentiment` property against a hand-rolled running
# average (0.99 weight on the previous value, 0.01 on the new tweet) that
# skips tweets whose sentiment is exactly zero.
sentiment = 0.0
for t in self.mock_tweets:
self.feels_db.insert_tweet(t)
if t['sentiment']!=0:
# print(f'0.99*{sentiment} + 0.01*{t["sentiment"]}')
sentiment = 0.99*sentiment + 0.01*t['sentiment']
# print(f'sentiment = {sentiment}')
self.mock_feels.clear_buffer()
# calc = 0*0.99**2 + 0.01*0.99*-0.7351 + 0.01*-0.5719
# = -0.01299649
self.mock_feels._latest_calc = self.mock_feels._feels.start
self.assertTrue(np.isclose(self.mock_feels.sentiment, sentiment))
# first observation is at 2017-2-19 19:14:18 and we are using default
# 60 second bins, therefore the observation at 2017-2-21 19:14:20 will
# never get saved but will always be recalculated.
self.assertEqual(self.mock_feels._latest_calc,
datetime(2017, 2, 21, 19, 14, 18))

# repeat the calculation, nothing changes
self.assertTrue(np.isclose(self.mock_feels.sentiment, sentiment))
self.assertEqual(self.mock_feels._latest_calc,
datetime(2017, 2, 21, 19, 14, 18))

def test_sentiments(self):
# Steps through `sentiments()` in one-day increments starting at
# 2017-02-19 00:00:00 and checks the first three yielded values.
for t in self.mock_tweets:
self.feels_db.insert_tweet(t)
self.mock_feels.clear_buffer()
self.mock_feels.calc_every_n = 1
start = datetime(2017, 2, 19, 0, 0, 0)
dt = timedelta(days=1)
sentiment = self.mock_feels.sentiments(strt=start, delta_time=dt)
self.assertTrue(np.isclose(next(sentiment), 0))
# 0.01 * -0.7351
self.assertTrue(np.isclose(next(sentiment), -0.007351))
# 0.99 * -0.007351 + 0.01 * -0.5719
self.assertTrue(np.isclose(next(sentiment), -0.01299649))
# Exhaust the rest of the generator, printing any remaining values.
for s in sentiment:
print(s)
# we are starting at 2017-2-19 00:00:00 and using bins with length 1 day
# therefore our latest calc will be just prior to the final observation.
self.assertEqual(self.mock_feels._latest_calc,
datetime(2017, 2, 21, 0, 0, 0))
14 changes: 7 additions & 7 deletions test/test_listener.py
Expand Up @@ -41,29 +41,29 @@ def test_on_disconnect(self, mock_feels):
@patch('tweetfeels.TweetFeels')
def test_on_connect(self, mock_feels):
tl = TweetListener(mock_feels)
tl.waited = 60
tl._waited = 60
tl.on_connect()
self.assertEqual(tl.waited, 0)
self.assertEqual(tl._waited, 0)

@patch('tweetfeels.TweetFeels')
def test_on_error(self, mock_feels):
tl = TweetListener(mock_feels)
tl.reconnect_wait = MagicMock()
tl.on_error(420)
tl.reconnect_wait.assert_called_with('exponential')
self.assertEqual(tl.waited, 60)
self.assertEqual(tl._waited, 60)
mock_feels.on_error.assert_called_with(420)

@patch('tweetfeels.TweetFeels')
def test_reconnect_wait(self, mock_feels):
tl = TweetListener(mock_feels)
tl.waited = 0.1
tl._waited = 0.1
tl.reconnect_wait('linear')
self.assertEqual(tl.waited, 1.1)
tl.waited = 0.1
self.assertEqual(tl._waited, 1.1)
tl._waited = 0.1
tl.reconnect_wait('exponential')
tl.reconnect_wait('exponential')
self.assertEqual(tl.waited, 0.4)
self.assertEqual(tl._waited, 0.4)


class Test_Tweet(unittest.TestCase):
Expand Down

0 comments on commit 261b308

Please sign in to comment.