-
Notifications
You must be signed in to change notification settings - Fork 0
/
mb_nodesets_01.py
187 lines (141 loc) · 5.66 KB
/
mb_nodesets_01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# mb_nodesets_01.py
# Version a01
# by jmg - j.gagen*AT*gold*DOT*ac*DOT*uk
# Nov 8th 2017
# Licence: http://creativecommons.org/licenses/by-nc-sa/3.0/
# Source code at: https://github.com/pha5echange/eng-tools
# Examines time-sliced MB 'Omega Year' genre lists (from 'ts_data/')
# Creates a set for each genre, and adds artist info. as elements
# Finds shared artists to facilitate node-list generation
# Writes weighted nodelist
# Run AFTER 'mb_timeslicer.py'
# import packages
import os
import resource
import sys
from collections import OrderedDict
from datetime import datetime
# Version tag; it is embedded in the log file name and in the banners below
versionNumber = "a01"

# The run log lives under 'logs/' and is opened in append mode
logPath = os.path.join("logs", 'mb_nodesets_' + versionNumber + '_log.txt')
runLog = open(logPath, 'a')

# Announce the run on screen and in the log
bannerText = 'MB Genre Data Node Set Maker | ' + 'Version: ' + versionNumber
print ('\n' + bannerText + ' | Starting' + '\n')
runLog.write (bannerText + '\n' + '\n')

# Timestamps: one reported as the date of the run, one used to compute its duration
runDate = datetime.now()
startTime = datetime.now()

# Every entry found in 'ts_data/' is treated as one date to process
datePath = "ts_data/"
dateList = os.listdir(datePath)
dateSet = set(dateList)

# Run-wide counters, accumulated across all dates
totalIntersectCount = totalElementCount = 0
# For every dated folder under 'ts_data/': load each genre file as a set of its
# lines, intersect every ordered pair of genres, and write
#   <year>_genre_intersects.txt  - label^label^count^members (one line per non-empty pair)
#   <year>_wuGraph_data.txt      - label,label,weight        (weighted, undirected edge list)
# NOTE(review): the nesting here was reconstructed from a copy of the script whose
# indentation had been lost (inner 'while ... else' read as the reset/advance step);
# confirm against the original file before merging.
for date in dateSet:
    # Stray entries such as '.DS_Store' are not dates; skip them instead of crashing
    try:
        dateIP = int(date)
    except ValueError:
        print ('Skipping non-numeric entry in ts_data: ' + str(date))
        continue
    omegaYear = str(dateIP)

    # List the genre files first, so a missing 'genres' folder fails before any output is created
    omegaGenrePath = os.path.join("ts_data", omegaYear, "genres")
    fileNames = os.listdir(omegaGenrePath)

    # Output paths: full intersection data, and the weighted undirected graph data
    intersectDataPath = os.path.join("ts_data", omegaYear, omegaYear + '_genre_intersects.txt')
    wuGraphDataPath = os.path.join("ts_data", omegaYear, omegaYear + '_wuGraph_data.txt')

    # Per-year state. intersectSet is reset for every year so that the final
    # isolated-node check below never reads an undefined name or a value left
    # over from the previous year.
    setList = []
    setNameList = []
    intersectSet = set()

    # Context managers close both output files even if processing fails part-way
    with open(intersectDataPath, 'w') as intersectData, open(wuGraphDataPath, 'w') as wuGraphData:

        # Year 0 only gets (empty) output files; no genre files are read for it
        if dateIP != 0:
            for genreFile in fileNames:
                pathname = os.path.join(omegaGenrePath, genreFile)
                # Label is the file name up to its last dot; unlike a two-way
                # split(".") this also accepts names with no dot or several dots
                genreLabel = genreFile.rsplit(".", 1)[0]
                with open(pathname, "r") as dataInput:
                    # one set element per line of the genre file
                    genreSet = set(line.strip("\n") for line in dataInput)
                setList.append(genreSet)
                setNameList.append(genreLabel)

        # Labels as written to the output files: spaces and apostrophes removed
        labelList = [str(name).replace(" ", "").replace("'", "") for name in setNameList]

        for setAcount, setA in enumerate(setList):
            setAlabel = labelList[setAcount]

            for setBcount, setB in enumerate(setList):
                setBlabel = labelList[setBcount]

                # Pairs already covered with the roles swapped, and pairs whose
                # cleaned labels match (including a genre with itself), are only counted
                if setBcount < setAcount or setAlabel == setBlabel:
                    totalIntersectCount += 1
                    continue

                intersectSet = setA.intersection(setB)

                if not intersectSet:
                    # No shared elements: write both genres as self-edges of weight 0
                    # so that nodes with no connections still appear in the graph data
                    wuGraphData.write (setAlabel + ',' + setAlabel + ',' + "0" + '\n')
                    wuGraphData.write (setBlabel + ',' + setBlabel + ',' + "0" + '\n')
                    continue

                elementCount = len(intersectSet)
                intersectionStr = str(intersectSet)
                pairReport = 'Intersection of ' + setAlabel + ' and ' + setBlabel + ': ' + 'Elements: ' + str(elementCount) + ' SetAcount: ' + str(setAcount) + ' setBcount: ' + str(setBcount)
                print ('\n' + pairReport)
                runLog.write (pairReport + '\n')

                # An empty label (e.g. from a file name starting with a dot) is counted but not written
                if setAlabel:
                    # Full data file uses a circumflex ('^') separator, because the
                    # stringified set itself contains commas
                    intersectData.write (setAlabel + '^' + setBlabel + '^' + str(elementCount) + '^' + intersectionStr + '\n')
                    # Data file for weighted, undirected graph
                    wuGraphData.write (setAlabel + ',' + setBlabel + ',' + str(elementCount) + '\n')

                totalIntersectCount += 1
                totalElementCount += elementCount

        # If the last intersection computed for this year was empty (or none was
        # computed at all, e.g. a single genre), write the last genre as an isolated node
        if setList and not intersectSet:
            wuGraphData.write (setAlabel + ',' + setAlabel + ',' + "0" + '\n')
# Peak memory of this process. ru_maxrss is reported in bytes on macOS but in
# kilobytes on Linux and the BSDs, so the divisor that yields megabytes depends
# on the platform (a fixed 1048576 under-reports by a factor of 1024 on Linux).
maxRssUnitsPerMb = 1048576 if sys.platform == 'darwin' else 1024
memUseMb = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) / maxRssUnitsPerMb

# End timing of run
endTime = datetime.now()

# Summary lines shared by the log and the screen report
summaryLines = [
    'Total Intersections: ' + str(totalIntersectCount),
    'Total Co-occurrence: ' + str(totalElementCount),
    'Memory Used: ' + str(memUseMb) + 'Mb',
    'Date of run: {}'.format(runDate),
    'Duration of run : {}'.format(endTime - startTime),
]

# write to log, then release the log file
runLog.write ('\n' + 'Run Information' + '\n' + '\n')
runLog.write ('\n'.join(summaryLines) + '\n' + '\n')
runLog.close()

# write to screen (the screen report additionally shows the version)
print ('\n' + 'Run Information' + '\n')
print ('Version: ' + versionNumber)
for summaryLine in summaryLines:
    print (summaryLine)