# detect_communities.py
#
# Builds a graph from a pickled link dictionary, partitions it with leidenalg,
# and writes the member names of up to the first 100 communities to text files.
import igraph as ig
from collections import defaultdict
import numpy as np
import os
import math
from pathlib import Path
from contextlib import redirect_stdout
import sys
import logging
import pickle
import leidenalg as la
def strip_dates(e: str) -> str:
    """Return the part of *e* that follows its last ``-``.

    Everything up to and including the final ``-`` is discarded (presumably
    a date prefix, going by the function name -- confirm against the data).
    A string that contains no ``-`` is returned unchanged.

    Args:
        e: The string to trim.

    Returns:
        The text after the last ``-`` in *e*, or *e* itself when it has no
        ``-``. The result is empty when *e* ends with ``-``.
    """
    # NOTE: a sanity check that logged an error when the result did not hold
    # exactly three comma-separated fields used to live here and was disabled.
    # rsplit with maxsplit=1 always yields at least one element, so taking the
    # last element covers both the "has a dash" and "no dash" cases.
    return e.rsplit("-", 1)[-1]
# Timestamped INFO-level logging so each long-running stage can be followed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logging.info("reading dict")
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# this path must only ever point at a trusted, locally produced pickle.
# The context manager guarantees the handle is closed even if loading fails.
with open("D:/canada.pickle", "rb") as file:
    # `links` is consumed further down through links.items(), with each value
    # iterated as a collection of link targets.
    links = pickle.load(file)
logging.info("read dict")
logging.info("building graph")
# One edge per (key, value) pair of `links`. A generator is passed so the
# complete edge list is not materialised in Python before igraph consumes it.
g = ig.Graph.TupleList(
    (source, target) for source, targets in links.items() for target in targets
)
logging.info("built graph")
# Give every vertex and every edge a "weight" attribute of 1.
# NOTE(review): the find_partition call that follows passes neither `weights=`
# nor `node_sizes=`; confirm whether these attributes are actually consumed.
g.vs["weight"] = 1
g.es["weight"] = 1
logging.info("partitioning")
# Alternative partitioning calls that were tried and left disabled:
#part = g.community_leiden(objective_function='CPM', resolution_parameter=0.0001)
#part = la.find_partition(g, la.RBConfigurationVertexPartition, resolution_parameter = 0.05)
#part = la.find_partition(g, la.ModularityVertexPartition, n_iterations=50)
part = la.find_partition(
    g,
    la.CPMVertexPartition,
    resolution_parameter=0.000004,
    n_iterations=10,  # try 3?
)
# Drop this module's own reference to the graph.
# NOTE(review): `part` is still used afterwards via part.subgraph(), so
# confirm whether this actually releases any memory.
g = None
logging.info("partitioned")
# cluster_graph = part.cluster_graph(
# combine_vertices={
# "weight": "sum",
# },
# combine_edges={
# "weight": "sum",
# },
# )
# cluster_graph.write_picklez("D:/CPM_no_adjacency_0000065.pz")
# cluster_graph = None
# logging.info("wrote overall cluster graph")
directory = "D:/canada_communities_CPM"
os.makedirs(directory, exist_ok=True)

# Export at most the first 100 communities (by index) of the partition; the
# bound is clamped up front so no break-guard is needed inside the loop.
for i in range(min(100, len(part))):
    logging.info("parsing subgraph %s", i)
    g1 = part.subgraph(i)
    logging.info("writing cluster - %s", i)
    # Keep only vertex names that are strings; other name types are skipped.
    user_names = [name for name in g1.vs["name"] if isinstance(name, str)]
    # One name per line.
    # NOTE(review): np.savetxt uses its default output encoding here; confirm
    # that every name can be represented in it.
    np.savetxt(f"{directory}/cluster users {i}.txt", user_names, fmt="%s")
    logging.info("wrote subgraph %s", i)