-
Notifications
You must be signed in to change notification settings - Fork 0
/
flickr_process.py
147 lines (121 loc) · 4.44 KB
/
flickr_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Process Flickr data for analysis.
- 5.3 Experiments on Real-World Network Data
We sample 50000 users in a period (20 days) that have following events, and
record all the new edges formed by them. Edges formed earlier are assumed to be more
preferable than later ones. We then sample 100 edges uniformly at random from
the dataset as the negative samples for each user. The testing set is sampled
in the same way, with the sampling period later for 20 days.
The data should be downloaded first with:
wget -4 -P ../data http://socialnetworks.mpi-sws.mpg.de/data/flickr-growth.txt.gz
"""
import os
import random
from datetime import datetime, timedelta
import networkx as nx
import pandas as pd
from tqdm import tqdm
def collect_edge_features(i, j, alt=1):
    """Return a feature dict for the candidate edge (i, j).

    Reads the module-level graph ``G``, i.e. the state of the network
    *before* this edge is added.

    Parameters
    ----------
    i : node
        The chooser (source of the potential edge).
    j : node
        The candidate target.
    alt : int, optional
        Alternative label: 0 marks a negative (non-chosen) sample;
        values >= 1 mark a positive choice (callers encode the day of
        the choice here).

    Returns
    -------
    dict
        ``deg``: in-degree of ``j`` (0 if ``j`` is not yet in ``G``);
        ``hops``: shortest-path length i -> j, or ``'NA'`` when there is
        no path or an endpoint is missing; ``recip``: 1 if the reciprocal
        edge j -> i already exists, else 0; ``alt``: the label above.
    """
    features = {}
    # Nodes never seen before have in-degree 0.
    features["deg"] = G.in_degree(j) if j in G.nodes else 0
    # Only catch the two expected failure modes (unreachable target,
    # endpoint absent from G); a bare `except Exception` here would also
    # mask genuine bugs.
    try:
        features["hops"] = nx.shortest_path_length(G, source=i, target=j)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        features["hops"] = 'NA'
    # Reciprocity: does the candidate already follow the chooser?
    features["recip"] = 1 if G.has_edge(j, i) else 0
    features["alt"] = alt
    return features
data_path = '../data'
# Sampling window: training choices come from [start_date, end_date);
# the test window starts at end_date (20 days each, per module docstring).
start_date = '2006-11-05'
end_date = '2006-11-25'
# file names
fn_in = data_path + '/flickr-growth.txt.gz'
fn_out = data_path + '/flickr-growth_choices_p_%s.csv' % \
    datetime.strptime(start_date, '%Y-%m-%d').strftime('%y%m%d')
url = 'http://socialnetworks.mpi-sws.mpg.de/data'
# Abort if the input data has not been downloaded yet. (Previously this
# only printed and fell through, so the script crashed later on read.)
if not os.path.exists(fn_in):
    print("[ERROR] Input data not found. Please download with:\n" +
          "  wget -P %s %s/flickr-growth.txt.gz" % (data_path, url))
    raise SystemExit(1)
# Abort instead of clobbering existing results.
# NOTE(review): fn_out is checked here, but the results below are written
# to flickr_train.csv / flickr_test.csv — confirm the intended output path.
if os.path.exists(fn_out):
    print("[ERROR] Output data already exists! Please remove it to run.")
    raise SystemExit(1)
print("Reading raw data and creating graph.")
# Edge list: one "from <tab> to <tab> date" row per new follow event.
DF = pd.read_csv(fn_in, compression='gzip', header=0, sep='\t',
                 names=['from', 'to', 'ds'])
# Edges formed before the sampling window seed the starting graph.
el_pre = DF[DF.ds < start_date]
el_pre = list(zip(el_pre['from'], el_pre['to'], el_pre['ds']))
# Training-window edges and the first test day.
el_post = DF[(DF.ds >= start_date) & (DF.ds < end_date)]
el_test = DF[DF.ds == end_date]
# create starting graph
G = nx.DiGraph()
G.add_edges_from([(x[0], x[1]) for x in el_pre])
print("Starting graph has %d nodes." % len(G))
# Oversample 60k active users, then keep up to 50k that already exist in
# the starting graph so their pre-window features are defined.
num_samples = 60000
su = set(random.sample(list(el_post["from"].unique()), num_samples))
su = list(su.intersection(G.nodes))[:50000]
print("Collect train data.")
# Replay the training window one day at a time. A user's positive choices
# on day n are labelled alt = 1 + n (earlier edges are assumed more
# preferable), and each day's edges are added to G only AFTER features are
# collected, so features always reflect the graph state before the choice.
n = 0
ds = start_date
start_time = datetime.strptime(start_date, '%Y-%m-%d')
data = {i: [] for i in su}
while ds != end_date:
    el_curr = el_post[el_post["ds"] == ds]
    for i in tqdm(su, desc=ds):
        for j in el_curr[el_curr["from"] == i]["to"].values:
            data[i].append(collect_edge_features(i, j, 1 + n))
    # Fold today's edges into the graph before moving to the next day.
    G.add_edges_from(el_curr[["from", "to"]].values.tolist())
    n += 1
    ds = (start_time + timedelta(n)).strftime('%Y-%m-%d')
# Negative samples: draw n_neg + 10 candidates, drop any the user already
# follows, keep the first n_neg. list() is required: G.nodes is a NodeView,
# not a sequence, and random.sample rejects non-sequences on Python >= 3.11.
# NOTE(review): if more than 10 draws are already followed, fewer than
# n_neg negatives remain for that user — confirm this is acceptable.
n_neg = 100
for i in tqdm(su):
    candidates = random.sample(list(G.nodes), n_neg + 10)
    succs = set(G.successors(i))
    neg_samples = list(set(candidates) - succs)[:n_neg]
    for j in neg_samples:
        data[i].append(collect_edge_features(i, j, 0))
# Flatten per-user records into one row per alternative, tagged with a
# per-user "index" that identifies the choice set.
entries = []
for idx, v in enumerate(data.values()):
    for entry in v:
        entry["index"] = idx
        entries.append(entry)
pd.DataFrame(entries).to_csv("flickr_train.csv")
print("Collect test data.")
# Test set: users who formed an edge on end_date; their positives are
# collected over the following 20 days (all labelled alt = 1), mirroring
# the training procedure but shifted 20 days later.
n_samples = 8000
n_neg = 100
samples = random.sample(list(DF[DF["ds"] == end_date]["from"].unique()), n_samples)
# Keep only users already present in the graph, capped at 5000.
samples = list(set(samples).intersection(G.nodes))[:5000]
n = 0
ds = end_date
end_time = datetime.strptime(end_date, '%Y-%m-%d')
test_data = {i: [] for i in samples}
while n < 20:
    el_curr = DF[DF["ds"] == ds]
    for i in tqdm(samples, desc=ds):
        for j in el_curr[el_curr["from"] == i]["to"].values:
            test_data[i].append(collect_edge_features(i, j, 1))
    # As in training: add the day's edges only after features are collected.
    G.add_edges_from(el_curr[["from", "to"]].values.tolist())
    n += 1
    ds = (end_time + timedelta(n)).strftime('%Y-%m-%d')
# Negative sampling, same scheme as training. list() is required because
# random.sample rejects non-sequence G.nodes on Python >= 3.11.
n_neg = 100
for i in tqdm(samples):
    candidates = random.sample(list(G.nodes), n_neg + 10)
    succs = set(G.successors(i))
    neg_samples = list(set(candidates) - succs)[:n_neg]
    for j in neg_samples:
        test_data[i].append(collect_edge_features(i, j, 0))
# One row per alternative, "index" identifying each user's choice set.
entries = []
for idx, v in enumerate(test_data.values()):
    for entry in v:
        entry["index"] = idx
        entries.append(entry)
pd.DataFrame(entries).to_csv("flickr_test.csv")