-
Notifications
You must be signed in to change notification settings - Fork 6
/
main_classify.m
116 lines (93 loc) · 3.92 KB
/
main_classify.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
% Main script to run classification for the Authorship Attribution (AA)
% project using NPLM
%
% Zhenhao (Roger) Ge, 2015-08-25
%% Initialization
% Set up the directory layout relative to the parent of the current working
% folder. Paths are joined with fullfile so that the platform's own file
% separator is used instead of a hard-coded '\'.
directory.work = pwd;
directory.root = fileparts(directory.work);
directory.data = fullfile(directory.root, 'data');
directory.stem = fullfile(directory.data, 'stem');
directory.split = fullfile(directory.data, 'split');
directory.mat = fullfile(directory.data, 'mat');
directory.lm = fullfile(directory.data, 'lm');
directory.result = fullfile(directory.root, 'results');

% find data file names ('txt' entries of the stem folder, via the project
% helper getfile); each entry is treated as one data set
textnames = getfile(directory.stem, 'txt');
num.set = length(textnames);

% set dataset parameters
num.gram = 4; % specify grams (# of context + target words)
num.seed = 5; % number of random seeds to process

% set other parameters
add_sil = 0; % flag of adding silence in sentence extraction
rm_sil = 0; % flag of removing silence after sentence extraction
group_len_max = 50; % number of columns in the nbest / confusion results
refresh = 0; % set to 1 to regenerate nbest / confusion files that already exist
%% classification over course and seed
% For every data set and every seed:
%   1. load the perplexity (ppl) matrix of the test sentences, or compute
%      and save it when the score file does not exist yet;
%   2. derive the nbest and confusion results for aggregation sizes
%      1..group_len_max and save whichever result files are missing
%      (or all of them when refresh == 1).
show_result = 0; % forwarded to seq_ppl

% Output folders for the three kinds of result files. Paths are built with
% fullfile rather than by concatenating hard-coded '\' separators.
dir_scores = fullfile(directory.result, 'scores');
dir_nbest = fullfile(directory.result, 'nbest');
dir_confusion = fullfile(directory.result, 'confusion');

% save() does not create missing folders, so make sure they exist up front
out_dirs = {dir_scores, dir_nbest, dir_confusion};
for d = 1:numel(out_dirs)
    if ~exist(out_dirs{d}, 'dir')
        mkdir(out_dirs{d});
    end
end

for i = 1:num.set
    % specify set index
    set_idx = i;

    % get course and instructor info (file name without folder/extension)
    textname = textnames{set_idx};
    [~, course_instructor] = fileparts(textname);

    for j = 1:num.seed
        % specify seed (file names use zero-based, two-digit seeds)
        seed_int = j - 1;
        seed_str = sprintf('%02d', seed_int);

        % specify the ppl file; the nbest and confusion files share its name
        score_name = [course_instructor, '_test_rand', seed_str, '.mat'];
        file.score = fullfile(dir_scores, score_name);

        if exist(file.score, 'file')
            disp(['loading existed ', file.score, ' ...']),
            % load into a struct so the file's contents cannot overwrite
            % unrelated workspace variables (num, file, loop indices, ...)
            scores = load(file.score, 'ppl');
            ppl = scores.ppl;
        else
            disp(['creating new ', file.score, ' ...']),

            % get test sentences and its number
            file.test = fullfile(directory.split, ...
                [course_instructor, '_test_rand', seed_str, '.txt']);
            sentences = extract_sentences(file.test, add_sil);
            num.sent = length(sentences);

            % compute ppl: rows are test sentences, columns are the
            % language models of the individual sets
            ppl = zeros(num.sent, num.set);
            timecost = zeros(num.sent, 1);

            % The set loop is the outer loop so that each language model is
            % read from disk once per (set, seed) instead of once per
            % sentence. timecost(m) accumulates the scoring time of
            % sentence m over all models and excludes the model loading.
            for n = 1:num.set
                [~, ci] = fileparts(textnames{n});
                file.lm = fullfile(directory.lm, [ci, '_rand', seed_str, ...
                    '_', num2str(num.gram), 'gram_lm.mat']);
                lm = load(file.lm, 'model');
                for m = 1:num.sent
                    tStart = tic;
                    ppl(m, n) = seq_ppl(sentences{m}, lm.model, show_result);
                    timecost(m) = timecost(m) + toc(tStart);
                end
            end

            % save ppl for current set and seed
            save(file.score, 'ppl', 'timecost');
        end

        % generate nbest and confusion results when necessary
        nbest = zeros(num.set, group_len_max);
        confusion = zeros(num.set, group_len_max);
        file.nbest = fullfile(dir_nbest, score_name);
        file.confusion = fullfile(dir_confusion, score_name);

        % decide once per (set, seed) which outputs must be (re)built,
        % instead of querying the file system inside the k loop
        need_nbest = ~exist(file.nbest, 'file') || refresh == 1;
        need_confusion = ~exist(file.confusion, 'file') || refresh == 1;

        if need_nbest || need_confusion
            for k = 1:group_len_max
                % aggregate the ppl matrix with group size k (see aggregate)
                ppl_agg = aggregate(ppl, k);
                if need_nbest
                    nbest(:,k) = nbest_accuracy(ppl_agg, set_idx);
                end
                if need_confusion
                    confusion(:,k) = confusion_array(ppl_agg);
                end
            end
            if need_nbest
                disp(['saving ', file.nbest, ' ...']),
                save(file.nbest, 'nbest');
            end
            if need_confusion
                disp(['saving ', file.confusion, ' ...']),
                save(file.confusion, 'confusion');
            end
        end
    end
end