This repository has been archived by the owner on Jul 27, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
link_counter.py
96 lines (70 loc) · 3.14 KB
/
link_counter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#This code takes a list of Twitter usernames, iterates over them to find tweets where they shared links,
#then tallies the base URLs of everyone's links combined and turns the totals into a matplotlib graph.
#I put a bunch of code documentation in and it really will help you use this.
#The code can take a while to run depending on your tweet limit and how many accounts you pull.
import pandas as pd
import re
from urllib.parse import urlparse
from urllib.request import urlopen
import csv
import twint #you may need to install this first if you haven't!
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
#this prevents async problems/ runtime errors
#https://markhneedham.com/blog/2019/05/10/jupyter-runtimeerror-this-event-loop-is-already-running/
import nest_asyncio
nest_asyncio.apply()
#put accounts in between the brackets, comma separated, without the @ sign. ie ["jack", "realDonaldtrump", "Blacksocialists"]
sourceAccounts= ["PUT YOUR ACCOUNTS HERE" , "DIRECTIONS ABOVE"]
#Create the output CSV once (empty) so the append-mode writes below always have a target.
#Text mode ('w'), not binary ('wb'): the file is later written by csv.writer on a
#text-mode handle, so creating it in text mode keeps the modes consistent.
if not os.path.isfile('all_urls.csv'):
    with open('all_urls.csv', 'w') as f:
        pass
#Pull up to c.Limit link-bearing tweets per account and append (username, base URL)
#rows to all_urls.csv after each account finishes.
for username in sourceAccounts:
    c = twint.Config()
    print("pulling tweets for " + str(username) + "...")
    c.Username = username
    c.Hide_output = True #makes the command line less noisy
    c.Limit = 500 #maximum number of tweets to pull per account
    c.Store_object = True
    #only selects tweets that have links
    c.Links = "include"
    baseURLs = []
    #BUGFIX: twint appends results to this module-global list and never clears it,
    #so without resetting it here every account after the first re-counts all
    #previous accounts' tweets. Reset before each search.
    twint.output.tweets_list = []
    twint.run.Search(c)
    tweets = twint.output.tweets_list
    for tweet in tweets:
        #urls is an attribute of the twint tweet object; to see all attributes: dir(tweet)
        for URL in tweet.urls:
            baseURL = urlparse(URL).netloc #gets the base URL (network location)
            if baseURL.startswith('twitter'): #ignores RTs/links back to twitter
                continue
            if baseURL.startswith('www.'): #strips www for a e s t h e t i c
                baseURL = baseURL[4:]
            baseURLs.append([username, baseURL])
    # I added this in case it gets slow in pulling the list so you can stop at any point and then just
    #edit your sourceAccounts list to get rid of the one's you've already done.
    with open('all_urls.csv','a', newline='') as f:
        writer = csv.writer(f) #create the writer once, not once per row
        writer.writerows(baseURLs)
#Summarise every collected link and plot the ten most frequent base URLs.
all_urls = pd.read_csv('all_urls.csv', names = ['username','URL'])
print("total tweets pulled: " + str(len(all_urls)))
#value_counts() yields base URLs sorted by frequency, most frequent first.
countedURLs = all_urls['URL'].value_counts()
countedURLs.to_csv('countedURLs.csv')
#Take the top 10, then reverse so the biggest bar ends up at the top of the chart
#(barh draws index 0 at the bottom).
top_urls = countedURLs.iloc[:10]
top_urls = top_urls[::-1]
y_pos = np.arange(len(top_urls))
print(top_urls)
print(top_urls.index)
plt.barh(y_pos, top_urls, align='center', alpha=0.5)
plt.yticks(y_pos, top_urls.index)
plt.xlabel('Frequency of Links')
plt.title('Most Frequent External Links of all Handles Tested')
plt.show()