Skip to content

Commit e3a9516

Browse files
committed
HD tests
Series of different tests in HD space
1 parent 13d2ba7 commit e3a9516

12 files changed

+187193
-20549
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
*.csv
3+
updated_semhash_pipeline.ipynb
4+
semhash_pipeline.ipynb
5+
*.csv
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 2
6+
}

.ipynb_checkpoints/SemHash_V_HDvec-checkpoint.ipynb

Lines changed: 67154 additions & 0 deletions
Large diffs are not rendered by default.

HD_ngrams.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Tue Jan 22 19:13:43 2019
5+
6+
@author: denkle
7+
"""
8+
import numpy as np
9+
10+
def ngram_encode(str, HD_aphabet, aphabet, n_size):
    """Map the n-gram statistics of a word to an N-dimensional HD vector.

    Parameters
    ----------
    str : word to encode (parameter name shadows the builtin; kept for
        interface compatibility). Assumed lowercase, drawn from `aphabet`.
    HD_aphabet : 2-D array with one bipolar {-1, +1}^N HD vector per row;
        row i encodes symbol `aphabet[i]`.
    aphabet : string listing the alphabet symbols.
    n_size : n-gram size (>= 1).

    Returns
    -------
    1-D float array of length HD_aphabet.shape[1], scaled so its Euclidean
    norm equals sqrt(N).  If the padded string yields no n-grams (e.g. an
    empty input with n_size > 2), an all-zero vector is returned instead of
    dividing by a zero norm (the original code produced NaNs here).
    """
    N = HD_aphabet.shape[1]
    HD_ngram = np.zeros(N)  # accumulates the HD vectors of all observed n-grams
    full_str = '#' + str + '#'  # pad with sentinel symbols marking word boundaries

    # Trim the padded string so every start index has a full n-gram ahead of it.
    if n_size == 1:
        full_str_e = full_str
    else:
        full_str_e = full_str[:-(n_size - 1)]

    for il in range(len(full_str_e)):  # one iteration per n-gram start position
        # NOTE(review): aphabet.find() returns -1 for symbols not in the
        # alphabet, which silently picks the LAST row; inputs assumed clean.
        hdgram = HD_aphabet[aphabet.find(full_str[il]), :]  # first symbol's HD vector

        for ng in range(1, n_size):  # remaining symbols of the current n-gram
            # Bind via elementwise multiplication; encode the symbol's position
            # within the n-gram via a cyclic shift (np.roll).
            hdgram = hdgram * np.roll(HD_aphabet[aphabet.find(full_str[il + ng]), :], ng)

        HD_ngram += hdgram  # add this n-gram's HD vector to the statistics

    norm = np.linalg.norm(HD_ngram)
    if norm == 0:  # no n-grams observed (or exact cancellation): avoid NaNs
        return HD_ngram
    return np.sqrt(N) * (HD_ngram / norm)  # normalize so the norm equals sqrt(N)
30+
31+
32+
33+
# Demo: build a random bipolar item memory and encode an example word.
N = 1000  # dimensionality of the HD vectors
n_size = 3  # size of the n-grams
aphabet = 'abcdefghijklmnopqrstuvwxyz#'  # fixed alphabet; capital letters assumed absent
np.random.seed(1)  # fix the RNG so runs are reproducible

# One random bipolar {-1, +1}^N HD vector per alphabet symbol.
HD_aphabet = np.where(np.random.randn(len(aphabet), N) < 0, 1, -1)

str = 'jump'  # example word to represent via its n-gram statistics
# Projection of the word's n-gram statistics into N-dimensional HD space;
# usable downstream for learning a word embedding.
HD_ngram = ngram_encode(str, HD_aphabet, aphabet, n_size)

HD_ngrams_binary.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Tue Jan 22 19:13:43 2019
5+
6+
@author: denkle
7+
"""
8+
import numpy as np
9+
10+
def ngram_encode_bin(str, HD_aphabet, aphabet, n_size):
    """Map the n-gram statistics of a word to an N-dimensional binary HD vector.

    Parameters
    ----------
    str : word to encode (parameter name shadows the builtin; kept for
        interface compatibility). Assumed lowercase, drawn from `aphabet`.
    HD_aphabet : 2-D boolean array with one binary {0, 1}^N HD vector per row;
        row i encodes symbol `aphabet[i]`.
    aphabet : string listing the alphabet symbols.
    n_size : n-gram size (>= 1).

    Returns
    -------
    1-D float array of length HD_aphabet.shape[1], divided by the number of
    n-grams so every component lies in [0, 1].  If the padded string yields
    no n-grams (e.g. an empty input with n_size > 2), an all-zero vector is
    returned instead of dividing by zero (the original code produced NaNs).
    """
    N = HD_aphabet.shape[1]
    HD_ngram = np.zeros(N)  # accumulates the HD vectors of all observed n-grams
    full_str = '#' + str + '#'  # pad with sentinel symbols marking word boundaries

    # Trim the padded string so every start index has a full n-gram ahead of it.
    if n_size == 1:
        full_str_e = full_str
    else:
        full_str_e = full_str[:-(n_size - 1)]

    for il in range(len(full_str_e)):  # one iteration per n-gram start position
        # NOTE(review): aphabet.find() returns -1 for symbols not in the
        # alphabet, which silently picks the LAST row; inputs assumed clean.
        hdgram = HD_aphabet[aphabet.find(full_str[il]), :]  # first symbol's HD vector

        for ng in range(1, n_size):  # remaining symbols of the current n-gram
            # Bind via XOR; encode the symbol's position within the n-gram
            # via a cyclic shift (np.roll).
            hdgram = np.logical_xor(hdgram, np.roll(HD_aphabet[aphabet.find(full_str[il + ng]), :], ng))

        HD_ngram += hdgram  # add this n-gram's HD vector to the statistics

    n_grams = len(full_str_e)
    if n_grams == 0:  # no n-grams observed: avoid division by zero
        return HD_ngram
    return HD_ngram / n_grams  # each position becomes a frequency in [0, 1]
31+
32+
# Demo: build a random binary item memory and encode an example word.
N = 1000  # dimensionality of the HD vectors
n_size = 2  # size of the n-grams
aphabet = 'abcdefghijklmnopqrstuvwxyz#'  # fixed alphabet; capital letters assumed absent
np.random.seed(1)  # fix the RNG so runs are reproducible

# One random binary {0, 1}^N HD vector per alphabet symbol (boolean array).
HD_aphabet = np.less(np.random.randn(len(aphabet), N), 0)

str = 'jump'  # example word to represent via its n-gram statistics
# Projection of the word's n-gram statistics into N-dimensional HD space;
# usable downstream for learning a word embedding.
HD_ngram = ngram_encode_bin(str, HD_aphabet, aphabet, n_size)

HD_ngrams_mod.py

Lines changed: 90 additions & 0 deletions
Large diffs are not rendered by default.

Multiclass and multi lables.ipynb

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 7,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"Confusion matrix for label B:\n",
13+
"[[2 2]\n",
14+
" [2 1]]\n",
15+
"Confusion matrix for label T:\n",
16+
"[[1 2]\n",
17+
" [2 2]]\n",
18+
"Confusion matrix for label D:\n",
19+
"[[2 1]\n",
20+
" [2 2]]\n",
21+
"Confusion matrix for label C:\n",
22+
"[[5 2]\n",
23+
" [0 0]]\n"
24+
]
25+
}
26+
],
27+
"source": [
28+
"import numpy as np\n",
29+
"from sklearn.metrics import confusion_matrix\n",
30+
"# from sklearn.metrics import multilabel_confusion_matrix\n",
31+
"\n",
32+
"y_true = np.array([[1,0,0, 0], [1,0,0, 0], [1,0,0, 0], [0,1,1,0], [0,1,1,0], [0,1,1,0], [0,1,1,0]])\n",
33+
"y_pred = np.array([[1,0,0, 0], [0,1,0, 0], [0,1,1, 0], [1,0,1,0], [1,0,0,1], [0,1,0,1], [0,1,1,0]])\n",
34+
"\n",
35+
"labels = [\"B\", \"T\", \"D\", \"C\"]\n",
36+
"\n",
37+
"conf_mat_dict={}\n",
38+
"\n",
39+
"for label_col in range(len(labels)):\n",
40+
" y_true_label = y_true[:, label_col]\n",
41+
" y_pred_label = y_pred[:, label_col]\n",
42+
" conf_mat_dict[labels[label_col]] = confusion_matrix(y_pred=y_pred_label, y_true=y_true_label)\n",
43+
"\n",
44+
"\n",
45+
"for label, matrix in conf_mat_dict.items():\n",
46+
" print(\"Confusion matrix for label {}:\".format(label))\n",
47+
" print(matrix)"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"metadata": {},
54+
"outputs": [],
55+
"source": []
56+
}
57+
],
58+
"metadata": {
59+
"kernelspec": {
60+
"display_name": "Python 3",
61+
"language": "python",
62+
"name": "python3"
63+
},
64+
"language_info": {
65+
"codemirror_mode": {
66+
"name": "ipython",
67+
"version": 3
68+
},
69+
"file_extension": ".py",
70+
"mimetype": "text/x-python",
71+
"name": "python",
72+
"nbconvert_exporter": "python",
73+
"pygments_lexer": "ipython3",
74+
"version": "3.6.8"
75+
}
76+
},
77+
"nbformat": 4,
78+
"nbformat_minor": 2
79+
}

0 commit comments

Comments
 (0)