Skip to content

Commit e3a9516

Browse files
committed
HD tests
Series of different tests in HD space
1 parent 13d2ba7 commit e3a9516

12 files changed

+187193
-20549
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
*.csv
3+
updated_semhash_pipeline.ipynb
4+
semhash_pipeline.ipynb
5+
*.csv
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 2
6+
}

.ipynb_checkpoints/SemHash_V_HDvec-checkpoint.ipynb

Lines changed: 67154 additions & 0 deletions
Large diffs are not rendered by default.

HD_ngrams.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Tue Jan 22 19:13:43 2019
5+
6+
@author: denkle
7+
"""
8+
import numpy as np
9+
10+
def ngram_encode(str, HD_aphabet, aphabet, n_size):
    """Map the n-gram statistics of a word to an N-dimensional HD vector.

    Parameters
    ----------
    str : word to encode (parameter name shadows the builtin; kept for
        interface compatibility). Assumed lowercase, drawn from `aphabet`.
    HD_aphabet : 2-D array with one bipolar {-1, +1}^N HD vector per row;
        row i encodes symbol `aphabet[i]`.
    aphabet : string listing the alphabet symbols.
    n_size : n-gram size (>= 1).

    Returns
    -------
    1-D float array of length HD_aphabet.shape[1], scaled so its Euclidean
    norm equals sqrt(N).  If the padded string yields no n-grams (e.g. an
    empty input with n_size > 2), an all-zero vector is returned instead of
    dividing by a zero norm (the original code produced NaNs here).
    """
    N = HD_aphabet.shape[1]
    HD_ngram = np.zeros(N)  # accumulates the HD vectors of all observed n-grams
    full_str = '#' + str + '#'  # pad with sentinel symbols marking word boundaries

    # Trim the padded string so every start index has a full n-gram ahead of it.
    if n_size == 1:
        full_str_e = full_str
    else:
        full_str_e = full_str[:-(n_size - 1)]

    for il in range(len(full_str_e)):  # one iteration per n-gram start position
        # NOTE(review): aphabet.find() returns -1 for symbols not in the
        # alphabet, which silently picks the LAST row; inputs assumed clean.
        hdgram = HD_aphabet[aphabet.find(full_str[il]), :]  # first symbol's HD vector

        for ng in range(1, n_size):  # remaining symbols of the current n-gram
            # Bind via elementwise multiplication; encode the symbol's position
            # within the n-gram via a cyclic shift (np.roll).
            hdgram = hdgram * np.roll(HD_aphabet[aphabet.find(full_str[il + ng]), :], ng)

        HD_ngram += hdgram  # add this n-gram's HD vector to the statistics

    norm = np.linalg.norm(HD_ngram)
    if norm == 0:  # no n-grams observed (or exact cancellation): avoid NaNs
        return HD_ngram
    return np.sqrt(N) * (HD_ngram / norm)  # normalize so the norm equals sqrt(N)
30+
31+
32+
33+
# Demo: build a random bipolar item memory and encode an example word.
N = 1000  # dimensionality of the HD vectors
n_size = 3  # size of the n-grams
aphabet = 'abcdefghijklmnopqrstuvwxyz#'  # fixed alphabet; capital letters assumed absent
np.random.seed(1)  # fix the RNG so runs are reproducible

# One random bipolar {-1, +1}^N HD vector per alphabet symbol.
HD_aphabet = np.where(np.random.randn(len(aphabet), N) < 0, 1, -1)

str = 'jump'  # example word to represent via its n-gram statistics
# Projection of the word's n-gram statistics into N-dimensional HD space;
# usable downstream for learning a word embedding.
HD_ngram = ngram_encode(str, HD_aphabet, aphabet, n_size)

HD_ngrams_binary.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Tue Jan 22 19:13:43 2019
5+
6+
@author: denkle
7+
"""
8+
import numpy as np
9+
10+
def ngram_encode_bin(str, HD_aphabet, aphabet, n_size):
    """Map the n-gram statistics of a word to an N-dimensional binary HD vector.

    Parameters
    ----------
    str : word to encode (parameter name shadows the builtin; kept for
        interface compatibility). Assumed lowercase, drawn from `aphabet`.
    HD_aphabet : 2-D boolean array with one binary {0, 1}^N HD vector per row;
        row i encodes symbol `aphabet[i]`.
    aphabet : string listing the alphabet symbols.
    n_size : n-gram size (>= 1).

    Returns
    -------
    1-D float array of length HD_aphabet.shape[1], divided by the number of
    n-grams so every component lies in [0, 1].  If the padded string yields
    no n-grams (e.g. an empty input with n_size > 2), an all-zero vector is
    returned instead of dividing by zero (the original code produced NaNs).
    """
    N = HD_aphabet.shape[1]
    HD_ngram = np.zeros(N)  # accumulates the HD vectors of all observed n-grams
    full_str = '#' + str + '#'  # pad with sentinel symbols marking word boundaries

    # Trim the padded string so every start index has a full n-gram ahead of it.
    if n_size == 1:
        full_str_e = full_str
    else:
        full_str_e = full_str[:-(n_size - 1)]

    for il in range(len(full_str_e)):  # one iteration per n-gram start position
        # NOTE(review): aphabet.find() returns -1 for symbols not in the
        # alphabet, which silently picks the LAST row; inputs assumed clean.
        hdgram = HD_aphabet[aphabet.find(full_str[il]), :]  # first symbol's HD vector

        for ng in range(1, n_size):  # remaining symbols of the current n-gram
            # Bind via XOR; encode the symbol's position within the n-gram
            # via a cyclic shift (np.roll).
            hdgram = np.logical_xor(hdgram, np.roll(HD_aphabet[aphabet.find(full_str[il + ng]), :], ng))

        HD_ngram += hdgram  # add this n-gram's HD vector to the statistics

    n_grams = len(full_str_e)
    if n_grams == 0:  # no n-grams observed: avoid division by zero
        return HD_ngram
    return HD_ngram / n_grams  # each position becomes a frequency in [0, 1]
31+
32+
# Demo: build a random binary item memory and encode an example word.
N = 1000  # dimensionality of the HD vectors
n_size = 2  # size of the n-grams
aphabet = 'abcdefghijklmnopqrstuvwxyz#'  # fixed alphabet; capital letters assumed absent
np.random.seed(1)  # fix the RNG so runs are reproducible

# One random binary {0, 1}^N HD vector per alphabet symbol (boolean array).
HD_aphabet = np.less(np.random.randn(len(aphabet), N), 0)

str = 'jump'  # example word to represent via its n-gram statistics
# Projection of the word's n-gram statistics into N-dimensional HD space;
# usable downstream for learning a word embedding.
HD_ngram = ngram_encode_bin(str, HD_aphabet, aphabet, n_size)

HD_ngrams_mod.py

Lines changed: 90 additions & 0 deletions
Large diffs are not rendered by default.

Multiclass and multi lables.ipynb

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 7,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"Confusion matrix for label B:\n",
13+
"[[2 2]\n",
14+
" [2 1]]\n",
15+
"Confusion matrix for label T:\n",
16+
"[[1 2]\n",
17+
" [2 2]]\n",
18+
"Confusion matrix for label D:\n",
19+
"[[2 1]\n",
20+
" [2 2]]\n",
21+
"Confusion matrix for label C:\n",
22+
"[[5 2]\n",
23+
" [0 0]]\n"
24+
]
25+
}
26+
],
27+
"source": [
28+
"import numpy as np\n",
29+
"from sklearn.metrics import confusion_matrix\n",
30+
"# from sklearn.metrics import multilabel_confusion_matrix\n",
31+
"\n",
32+
"y_true = np.array([[1,0,0, 0], [1,0,0, 0], [1,0,0, 0], [0,1,1,0], [0,1,1,0], [0,1,1,0], [0,1,1,0]])\n",
33+
"y_pred = np.array([[1,0,0, 0], [0,1,0, 0], [0,1,1, 0], [1,0,1,0], [1,0,0,1], [0,1,0,1], [0,1,1,0]])\n",
34+
"\n",
35+
"labels = [\"B\", \"T\", \"D\", \"C\"]\n",
36+
"\n",
37+
"conf_mat_dict={}\n",
38+
"\n",
39+
"for label_col in range(len(labels)):\n",
40+
" y_true_label = y_true[:, label_col]\n",
41+
" y_pred_label = y_pred[:, label_col]\n",
42+
" conf_mat_dict[labels[label_col]] = confusion_matrix(y_pred=y_pred_label, y_true=y_true_label)\n",
43+
"\n",
44+
"\n",
45+
"for label, matrix in conf_mat_dict.items():\n",
46+
" print(\"Confusion matrix for label {}:\".format(label))\n",
47+
" print(matrix)"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"metadata": {},
54+
"outputs": [],
55+
"source": []
56+
}
57+
],
58+
"metadata": {
59+
"kernelspec": {
60+
"display_name": "Python 3",
61+
"language": "python",
62+
"name": "python3"
63+
},
64+
"language_info": {
65+
"codemirror_mode": {
66+
"name": "ipython",
67+
"version": 3
68+
},
69+
"file_extension": ".py",
70+
"mimetype": "text/x-python",
71+
"name": "python",
72+
"nbconvert_exporter": "python",
73+
"pygments_lexer": "ipython3",
74+
"version": "3.6.8"
75+
}
76+
},
77+
"nbformat": 4,
78+
"nbformat_minor": 2
79+
}

0 commit comments

Comments
 (0)