Skip to content

Commit

Permalink
Merge pull request #33 from uclatommy/release-0.4.0
Browse files Browse the repository at this point in the history
Release 0.4.0
  • Loading branch information
Thomas Chen, ASA committed Mar 13, 2017
2 parents cb0e48e + 86ec8eb commit f638b08
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 73 deletions.
34 changes: 23 additions & 11 deletions README.md
Expand Up @@ -67,7 +67,7 @@ login = [consumer_key, consumer_secret, access_token, access_token_secret]
>>> trump_feels = TweetFeels(login, tracking=['trump'])
>>> trump_feels.start(10)
Timer completed. Disconnecting now...
>>> trump_feels.sentiment
>>> trump_feels.sentiment.value
-0.0073007430343252711
```

Expand All @@ -79,7 +79,7 @@ Timer completed. Disconnecting now...
>>> def print_feels(seconds=10):
... while go_on:
... time.sleep(seconds)
... print(f'[{time.ctime()}] Sentiment Score: {trump_feels.sentiment}')
... print(f'[{time.ctime()}] Sentiment Score: {trump_feels.sentiment.value}')
...
>>> go_on = True
>>> t = Thread(target=print_feels)
Expand Down Expand Up @@ -126,7 +126,6 @@ Timer completed. Disconnecting now...

```python
>>> tesla_feels = TweetFeels(login, tracking=['tesla', 'tsla', 'gigafactory', 'elonmusk'], db='tesla.sqlite')
>>> tesla_feels.calc_every_n = 10
>>> t = Thread(target=print_feels, args=(tesla_feels, 120))
>>> tesla_feels.start()
>>> t.start()
Expand All @@ -140,26 +139,39 @@ Timer completed. Disconnecting now...
[Mon Feb 20 17:53:16 2017] Sentiment Score: 0.2485916177213093
```

#### Use the sentiments generator to replay captured data and plot
```python
import pandas as pd
from datetime import timedelta, datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
data1 = {s.end: s.value for s in tesla_feels.sentiments(delta_time=timedelta(minutes=15), nans=True)}
data2 = {s.end: s.volume for s in tesla_feels.sentiments(delta_time=timedelta(minutes=15), nans=True)}
df1 = pd.DataFrame.from_dict(data1, orient='index')
df2 = pd.DataFrame.from_dict(data2, orient='index')
fig, axes = plt.subplots(nrows=2, ncols=1)
fig.set_size_inches(15, 5)
plt.subplot(211).axes.get_xaxis().set_visible(False)
df1[0].plot(kind='line', title='Tesla Sentiment')
plt.subplot(212)
df2[0].plot(kind='area', title='Volume')
```
<img src="https://uclatommy.github.io/tweetfeels/images/volume.svg" width="100%" height="300">

# Methodology
There are a multitude of ways in which you could combine hundreds or thousands of tweets across time in order to calculate a single sentiment score. One naive method might be to bin tweets into discretized time-boxes. For example, perhaps you average the individual sentiment scores every 10 seconds so that the current sentiment is the average over the last 10 seconds. In this method, your choice of discretization length is arbitrary and will have an impact on the perceived variance of the score. It also disregards any past sentiment calculations.

To correct for these effects, we time-box every second and do not discard the sentiment from prior calculations. Instead, we phase out older tweet sentiments geometrically as we add in new tweets:
To correct for these effects, we time-box every minute by default and do not discard the sentiment from prior calculations. Instead, we phase out older tweet sentiments geometrically as we add in new tweets:

![f1]

Where ![f2] is the aggregate sentiment at time t, ![f3] is the sentiment score for the current time-box, and ![f5] is the fall-off factor between 0 and 1. We start the calculation with ![f4], which is why you will see the sentiment score move away from zero until it stabilizes around the natural value. Within each time-box we are using a weighted average of sentiment scores. For each tweet, we utilize the associated user's follower count as the measure of influence.
Where ![f2] is the aggregate sentiment at time t, ![f3] is the sentiment score for the current time-box, and ![f5] is the fall-off factor between 0 and 1. We start the calculation with ![f4], which is why you will see the sentiment score move away from zero until it stabilizes around the natural value. Within each time-box we are using a weighted average of sentiment scores. For each tweet, we utilize the associated user's follower and friend counts as the measure of influence.

Some tweets will also have a neutral score (0.0). In these cases, we exclude them from aggregation.

Here's an example of different model parameterizations of real-time Tesla sentiment:
<img src="https://uclatommy.github.io/tweetfeels/images/tesla-sentiment.svg" width="100%" height="300">

A time series can be generated iterating over ``TweetFeels.sentiments`` and creating a dictionary of values with the timestamp as the key. You can then load the dictionary into a pandas dataframe:
```python
data1 = {tesla_feels._latest_calc: s for s in tesla_feels.sentiments(delta_time=timedelta(minutes=5))}
df = pd.DataFrame.from_dict(data1, orient='index')
```

[f1]: http://chart.apis.google.com/chart?cht=tx&chl=S_{t}=%5calpha{S_{t-1}}%2B(1-%5calpha)s_t
[f2]: http://chart.apis.google.com/chart?cht=tx&chl=S_t
[f3]: http://chart.apis.google.com/chart?cht=tx&chl=s_t
Expand Down
60 changes: 44 additions & 16 deletions test/test_data.py
Expand Up @@ -23,13 +23,13 @@ def setUp(self):
{'created_at': 'Sun Feb 20 19:14:19 +0000 2017',
'id_str': '833394296418082818',
'text': 'Fake news. Sad!',
'user': {'followers_count': '100', 'friends_count': '200',
'user': {'followers_count': '200', 'friends_count': '200',
'location':None}
}, # sentiment value = -0.7351
{'created_at': 'Sun Feb 21 19:14:20 +0000 2017',
'id_str': '833394296418082819',
'text': 'I hate it.',
'user': {'followers_count': '100', 'friends_count': '200',
'user': {'followers_count': '200', 'friends_count': '200',
'location':None}
} # sentiment value = -0.5719
]
Expand Down Expand Up @@ -91,12 +91,40 @@ def test_fetch(self):

it = self.feels_db.fetchbin(binsize=timedelta(minutes=30))
cur = next(it)
self.assertEqual(cur[2]-cur[1], timedelta(minutes=30))
self.assertEqual(len(cur[0]), 103)
self.assertEqual(cur.end-cur.start, timedelta(minutes=30))
self.assertEqual(len(cur), 103)
cur = next(it)
self.assertEqual(len(cur), 1)
cur = next(it)
self.assertEqual(len(cur), 1)

def test_empty(self):
    """
    With ``empty=True`` the 12-hour bin generator also yields bins that
    hold no tweets.
    """
    for tweet in self.mock_tweets:
        self.feels_db.insert_tweet(tweet)
    bins = self.feels_db.fetchbin(binsize=timedelta(hours=12), empty=True)
    # Expected tweet count for each successive bin; ``None`` marks a bin
    # that is consumed from the generator without being checked.
    for expected in (1, 0, 1, None, 1):
        current = next(bins)
        if expected is not None:
            self.assertEqual(len(current), expected)

def test_bin(self):
for t in self.mock_tweets:
self.feels_db.insert_tweet(t)
it = self.feels_db.fetchbin(binsize=timedelta(hours=12), empty=True)
cur = next(it)
self.assertEqual(cur.influence, 300)
cur = next(it)
self.assertEqual(cur.influence, 0)
cur = next(it)
self.assertEqual(cur.influence, 400)
cur = next(it)
self.assertEqual(len(cur[0]), 1)
cur = next(it)
self.assertEqual(len(cur[0]), 1)
self.assertEqual(cur.influence, 400)

def test_data_operation(self):
twt = {'created_at': 'Sun Feb 19 19:14:18 +0000 2017',
Expand All @@ -105,21 +133,21 @@ def test_data_operation(self):
t = Tweet(twt)
self.assertEqual(len(t.keys()), 7)
self.feels_db.insert_tweet(t)
df = self.feels_db.tweets_since(datetime.now())
self.assertEqual(len(df), 0)
df = self.feels_db.tweets_since(0)
self.assertEqual(len(df), 1)
df.sentiment = 0.9
for row in df.itertuples():
b = self.feels_db.tweets_since(datetime.now())
self.assertEqual(len(b), 0)
b = self.feels_db.tweets_since(0)
self.assertEqual(len(b), 1)
b.df.sentiment = 0.9
for row in b.df.itertuples():
self.feels_db.update_tweet(
{'id_str': row.id_str, 'sentiment': row.sentiment}
)

start = datetime(2017, 2, 17, 0, 0, 0)
before = datetime(2017, 2, 18, 0, 0, 0)
after = datetime(2017, 2, 20, 0, 0, 0)
df = self.feels_db.tweets_between(start, before)
self.assertEqual(len(df), 0)
b = self.feels_db.tweets_between(start, before)
self.assertEqual(len(b), 0)

df = self.feels_db.tweets_between(start, after)
self.assertEqual(len(df), 1)
b = self.feels_db.tweets_between(start, after)
self.assertEqual(len(b), 1)
34 changes: 24 additions & 10 deletions test/test_feels.py
Expand Up @@ -6,7 +6,7 @@
import numpy as np
from datetime import datetime, timedelta

from tweetfeels import (TweetFeels, Tweet, TweetData)
from tweetfeels import (TweetFeels, Tweet, TweetData, Sentiment)


class Test_Feels(unittest.TestCase):
Expand Down Expand Up @@ -86,11 +86,11 @@ def test_on_data(self):
def test_sentiment(self):
mock_feels = TweetFeels("abcd")
mock_feels._feels.tweets_since = MagicMock(return_value=[])
mock_feels._sentiment = 0.5
mock_feels._sentiment = Sentiment(0.5, 0, 0, 0)
mock_feels._latest_calc = datetime(2017, 1, 1, 0, 0, 0)
mock_feels._feels.start = datetime(2017, 1, 1, 0, 0, 0)
mock_feels._feels.end = datetime(2017, 1, 1, 0, 0, 0)
self.assertEqual(mock_feels.sentiment, 0.5)
self.assertEqual(mock_feels.sentiment.value, 0.5)

def test_buffer(self):
mock_feels = TweetFeels('abcd')
Expand Down Expand Up @@ -120,17 +120,19 @@ def test_sentiment_comprehensive(self):
# calc = 0*0.99**2 + 0.01*0.99*-0.7351 + 0.01*-0.5719
# = -0.01299649
self.mock_feels._latest_calc = self.mock_feels._feels.start
self.assertTrue(np.isclose(self.mock_feels.sentiment, sentiment))
self.assertTrue(np.isclose(self.mock_feels.sentiment.value, sentiment))
# first observation is at 2017-2-19 19:14:18 and we are using default
# 60 second bins, therefore the observation at 2017-2-21 19:14:20 will
# never get saved but will always be recalculated.
self.assertEqual(self.mock_feels._latest_calc,
datetime(2017, 2, 21, 19, 14, 0))

# repeat the calculation, nothing changes
self.assertTrue(np.isclose(self.mock_feels.sentiment, sentiment))
self.assertTrue(np.isclose(self.mock_feels.sentiment.value, sentiment))
self.assertEqual(self.mock_feels._latest_calc,
datetime(2017, 2, 21, 19, 14, 0))
self.assertEqual(self.mock_feels.sentiment.start,
self.mock_feels._latest_calc)

def test_sentiment_factor(self):
sentiment = 0.0
Expand All @@ -141,7 +143,7 @@ def test_sentiment_factor(self):

# calc = 0*0.75**2 + 0.25*0.75*-0.7351 + 0.25*-0.5719
# = -0.28080625
mock_sentiment = self.mock_feels.sentiment
mock_sentiment = self.mock_feels.sentiment.value
self.assertTrue(np.isclose(mock_sentiment, sentiment))

def test_sentiment_binsize(self):
Expand All @@ -152,16 +154,28 @@ def test_sentiment_binsize(self):

self.mock_feels.factor = 0.75
self.mock_feels.binsize = timedelta(days=2.5)
mock_sentiment = self.mock_feels.sentiment
mock_sentiment = self.mock_feels.sentiment.value
self.assertTrue(np.isclose(mock_sentiment, sentiment))

def test_nans(self):
    """
    With ``nans=True`` the sentiments generator may yield NaN values, and
    doing so leaves the stored current sentiment untouched.
    """
    one_day = timedelta(hours=24)
    stream = self.mock_feels.sentiments(delta_time=one_day, nans=True)

    first = next(stream)
    self.assertEqual(first.value, 0)

    second = next(stream)
    # can return nans
    self.assertTrue(np.isnan(second.value))
    # does not affect current sentiment
    self.assertEqual(self.mock_feels._sentiment.value, 0)

    third = next(stream)
    self.assertTrue(third.value < 0)

def test_sentiments(self):
start = datetime(2017, 2, 19, 0, 0, 0)
dt = timedelta(minutes=30)
sentiment = self.mock_feels.sentiments(strt=start, delta_time=dt)
self.assertTrue(np.isclose(next(sentiment), 0))
self.assertTrue(np.isclose(next(sentiment), -0.007351))
self.assertTrue(np.isclose(next(sentiment), -0.01299649))
self.assertTrue(np.isclose(next(sentiment).value, 0))
self.assertTrue(np.isclose(next(sentiment).value, -0.007351))
self.assertTrue(np.isclose(next(sentiment).value, -0.01299649))
for s in sentiment:
print(s)
# we are starting at 2017-2-19 19:00:00 and using bins with length 30
Expand Down
8 changes: 5 additions & 3 deletions tweetfeels/__init__.py
@@ -1,4 +1,6 @@
from .tweetdata import TweetData
from .tweetdata import TweetData, TweetBin
from .tweetlistener import TweetListener, Tweet
from .tweetfeels import TweetFeels
__all__ = ['TweetFeels', 'TweetListener', 'TweetData', 'Tweet']
from .tweetfeels import TweetFeels, Sentiment
__all__ = ['TweetFeels', 'TweetListener', 'TweetData', 'Tweet', 'TweetBin',
'Sentiment'
]
72 changes: 63 additions & 9 deletions tweetfeels/tweetdata.py
Expand Up @@ -4,6 +4,53 @@
import logging
from datetime import datetime, timedelta


class TweetBin(object):
    """
    A container for a time-box of tweets. It includes information regarding the
    upper and lower datetime boundaries for the bin.

    :param df: The data associated with a bin. Anything supporting ``len()``
               is accepted; an empty bin may carry an empty list instead of
               a dataframe.
    :type df: DataFrame
    :param lower: The lower bound of the bin.
    :type lower: datetime
    :param upper: The upper bound of the bin.
    :type upper: datetime

    :ivar df: The dataframe containing tweet data for the bin.
    :ivar influence: A measure of total tweet influence associated with the bin.
    :ivar start: The beginning datetime for the bin.
    :ivar end: The ending datetime for the bin.
    """
    def __init__(self, df, lower, upper):
        self._df = df
        self._lower = lower
        self._upper = upper

    @property
    def start(self):
        """The lower bound the bin was created with."""
        return self._lower

    @property
    def end(self):
        """The upper bound the bin was created with."""
        return self._upper

    @property
    def df(self):
        """The underlying tweet data, exactly as passed to the constructor."""
        return self._df

    @property
    def influence(self):
        """
        Sum of the ``followers_count`` and ``friends_count`` columns over
        every tweet in the bin, or ``0`` when the bin holds no tweets.
        """
        # Empty bins can be built from a plain empty list, which has no
        # columns to index, so the length has to be checked first.
        if len(self._df) == 0:
            return 0
        followers = self._df['followers_count'].sum()
        friends = self._df['friends_count'].sum()
        return followers + friends

    def __len__(self):
        """Number of tweets held in the bin."""
        return len(self._df)

    def __repr__(self):
        """Compact description showing the bin boundaries and tweet count."""
        return (f'{type(self).__name__}(start={self._lower!r}, '
                f'end={self._upper!r}, size={len(self)})')


class TweetData(object):
"""
Models the tweet database.
Expand Down Expand Up @@ -74,7 +121,8 @@ def all(self):
)
return df

def fetchbin(self, start=None, end=None, binsize=timedelta(seconds=60)):
def fetchbin(self, start=None, end=None, binsize=timedelta(seconds=60),
empty=False):
"""
Returns a generator that can be used to iterate over the tweet data
based on ``binsize``.
Expand All @@ -85,6 +133,10 @@ def fetchbin(self, start=None, end=None, binsize=timedelta(seconds=60)):
:type end: datetime
:param binsize: Time duration for each bin for tweet grouping.
:type binsize: timedelta
:param empty: Determines whether empty dataframes will be yielded.
:type empty: boolean
:returns: A ``TweetBin`` holding the tweet data for each bin along with the bin's time boundaries.
:rtype: TweetBin
"""
second = timedelta(seconds=1)
if start is None: start=self.start-second
Expand All @@ -93,21 +145,23 @@ def fetchbin(self, start=None, end=None, binsize=timedelta(seconds=60)):
df = self.tweet_dates
df = df.groupby(pd.TimeGrouper(freq=f'{int(binsize/second)}S')).size()
df = df[df.index > start - binsize]
df = df[df != 0]
if not empty: df = df[df != 0]
conn = sqlite3.connect(self._db, detect_types=sqlite3.PARSE_DECLTYPES)
c = conn.cursor()
c.execute(
"SELECT * FROM tweets WHERE created_at > ? AND created_at <= ?",
(start, end)
)
for i in range(0,len(df)):
frame = pd.DataFrame.from_records(
data=c.fetchmany(df.iloc[i]), columns=self.fields,
index='created_at'
)
frame = []
if df.iloc[i] > 0:
frame = pd.DataFrame.from_records(
data=c.fetchmany(df.iloc[i]), columns=self.fields,
index='created_at'
)
left = df.index[i].to_pydatetime()
right = left + binsize
if len(frame)>0: yield (frame, left, right)
if len(frame)>0 or empty: yield TweetBin(frame, left, right)
c.close()

def tweets_since(self, dt):
Expand All @@ -122,7 +176,7 @@ def tweets_since(self, dt):
'SELECT * FROM tweets WHERE created_at > ?', conn, params=(dt,),
parse_dates=['created_at']
)
return df
return TweetBin(df, dt, datetime.now())

def tweets_between(self, start, end):
"""
Expand All @@ -139,7 +193,7 @@ def tweets_between(self, start, end):
'SELECT * FROM tweets WHERE created_at > ? AND created_at <= ?',
conn, params=(start, end), parse_dates=['created_at']
)
return df
return TweetBin(df, start, end)

def make_feels_db(self, filename='feels.sqlite'):
"""
Expand Down

0 comments on commit f638b08

Please sign in to comment.