-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
116 lines (87 loc) · 3.66 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from scipy import stats
from typing import *
from functools import partial, reduce
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from functools import reduce
from torchtext.datasets import TranslationDataset
from torchtext.data import Field, BucketIterator
from abc import ABC
T = TypeVar('T')


class Sampleable(ABC, Generic[T]):
    """A finite set of values, each paired with a probability, that can be sampled.

    Attributes are kept as two parallel lists:
    ``possible_choices[i]`` is drawn with probability ``probs[i]``.
    """

    def __init__(self, choices: List[Tuple[T, float]]):
        """Split ``choices`` into parallel value and probability lists.

        Args:
            choices: ``(value, probability)`` pairs. The probabilities are
                handed unchanged to ``np.random.choice`` when sampling, so
                they must satisfy that function's requirements.
        """
        self.possible_choices = [pair[0] for pair in choices]
        self.probs = [pair[1] for pair in choices]

    def sample(self) -> T:
        """Draw one value according to the stored probabilities.

        Returns:
            One element of ``possible_choices``.

        Raises:
            ValueError: propagated from ``np.random.choice`` (for example
                when there are no choices).
        """
        # Draw an index instead of the value itself: np.random.choice needs a
        # 1-D population and cannot pick among values that are themselves
        # lists. Passing the count makes numpy sample from range(count), so no
        # throwaway index list has to be built on every call.
        chosen_index = np.random.choice(len(self.possible_choices), p=self.probs)
        return self.possible_choices[chosen_index]
# Constraints are represented as plain strings.
Constraint = str
# A ranking is a list of constraints
# (presumably ordered from highest- to lowest-ranked -- TODO confirm).
Ranking = List[Constraint]
class Segment():
    """A single phone together with its voicing and obstruency flags.

    Attributes:
        char: the character this segment was built from.
        voiced: True when ``char`` is listed in ``segments.voiced``.
        obstruent: True when ``char`` is listed in ``segments.obstruents``.
    """

    def __init__(self, character: str):
        """Create a segment for ``character``.

        Args:
            character: must be a member of ``segments.phones``.

        Raises:
            AssertionError: if ``character`` is not a known phone.
        """
        # Imported at call time, as the original code did
        # (presumably to avoid an import cycle with `segments` -- TODO confirm).
        from segments import phones, voiced, obstruents

        if character not in phones:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type is kept so
            # existing callers see the same error class.
            raise AssertionError(
                "{!r} is not a known phone".format(character))

        self.char = character
        self.voiced = character in voiced
        self.obstruent = character in obstruents
class PossibleRankings(Sampleable[Ranking]):
    """A weighted set of candidate rankings.

    The base class is parameterized with ``Ranking`` so that type checkers see
    ``sample()`` as returning a ``Ranking`` rather than an unbound ``T``.
    """

    def __init__(self, choices: List[Tuple[Ranking, float]]):
        """Store ``(ranking, probability)`` pairs via the base-class initializer."""
        super().__init__(choices)
class PossibleSegments(Sampleable[Segment]):
    """A weighted set of candidate segments for one position.

    The base class is parameterized with ``Segment`` so that type checkers see
    ``sample()`` as returning a ``Segment`` rather than an unbound ``T``.
    """

    def __init__(self, choices: List[Tuple[Segment, float]]):
        """Store ``(segment, probability)`` pairs via the base-class initializer."""
        super().__init__(choices)

    def all_prop(self, prop: str) -> bool:
        """Return True when attribute ``prop`` is truthy on every possible segment.

        An empty set of choices yields True. The probabilities are ignored:
        every listed segment is checked, including any with probability 0.
        """
        # all() replaces a hand-rolled reduce(): it stops at the first falsy
        # attribute and always returns a real bool, matching the annotation.
        return all(getattr(candidate, prop) for candidate in self.possible_choices)

    def all_voiced(self) -> bool:
        """Return True when every possible segment has a truthy ``voiced`` flag."""
        return self.all_prop('voiced')

    def all_obstruent(self) -> bool:
        """Return True when every possible segment has a truthy ``obstruent`` flag."""
        return self.all_prop('obstruent')
class Word():
    """A word described as a sequence of positions, each a sampleable segment set."""

    # One seeded generator shared by all calls. The previous code built a new
    # random.Random(0) on every call, so the "random" example count was the
    # same constant each time. Sharing the generator keeps runs reproducible
    # while letting the count actually vary between calls.
    _rng = random.Random(0)

    def __init__(self, segments: List["PossibleSegments"]):
        """Store the per-position segment sets in order."""
        self.segments = segments

    def gen_sample_output(self, min_number_of_examples: int, max_number_of_examples: int) -> List[str]:
        """Generate several sampled instances of this word as one token list.

        The result starts with ``'<sos>'``, ends with ``'<eos>'``, and
        consecutive instances are separated by ``'<sep>'``. Each instance is
        produced by sampling every position once and taking the sampled
        segment's ``char``.

        Args:
            min_number_of_examples: inclusive lower bound on the instance count.
            max_number_of_examples: inclusive upper bound on the instance count.

        Returns:
            The token list. When the drawn count is zero (or negative) the
            result is ``['<sos>', '<eos>']``.

        Raises:
            ValueError: from ``randint`` when the lower bound exceeds the
                upper bound.
        """
        number_of_outputs = self._rng.randint(
            min_number_of_examples, max_number_of_examples)

        src = ['<sos>']
        for _ in range(number_of_outputs):
            src.extend(position.sample().char for position in self.segments)
            src.append('<sep>')

        if number_of_outputs > 0:
            # The final separator becomes the end-of-sequence marker.
            src[-1] = '<eos>'
        else:
            # Nothing was generated: append the marker instead of overwriting
            # '<sos>', which the unconditional assignment used to clobber.
            src.append('<eos>')
        return src
# One seeded generator shared by all calls. The previous code built a new
# random.Random(0) on every call, so every (word, rankings) pair received the
# same constant number of examples. Sharing the generator keeps runs
# reproducible while letting the count actually vary between calls.
_EXAMPLE_COUNT_RNG = random.Random(0)


def gen_examples_for_word_and_rankings(
        word: "Word",
        rankings: "PossibleRankings",
        min_per_pair: int,
        max_per_pair: int, min_per_word: int, max_per_word: int) -> List[Tuple[List[str], "Ranking"]]:
    """Generate a random number of (token list, ranking) examples for one pair.

    Args:
        word: source of token lists, via ``word.gen_sample_output``.
        rankings: source of rankings, via ``rankings.sample``.
        min_per_pair: inclusive lower bound on the number of examples.
        max_per_pair: inclusive upper bound on the number of examples.
        min_per_word: forwarded to ``word.gen_sample_output`` as its minimum.
        max_per_word: forwarded to ``word.gen_sample_output`` as its maximum.

    Returns:
        A list of ``(tokens, ranking)`` tuples; each tuple samples the word
        first and the ranking second.

    Raises:
        ValueError: from ``randint`` when ``min_per_pair > max_per_pair``.
    """
    # TODO maybe change from uniform random to some distribution weighted toward larger counts
    num_examples = _EXAMPLE_COUNT_RNG.randint(min_per_pair, max_per_pair)
    return [(word.gen_sample_output(min_per_word, max_per_word), rankings.sample())
            for _ in range(num_examples)]
def gen_all_examples(words_and_rankings: List[Tuple[Word,
                                                    PossibleRankings]],
                     min_per_pair,
                     max_per_pair, min_per_word, max_per_word) -> List[Tuple[List[str],
                                                                             Ranking]]:
    """Collect the examples of every (word, rankings) pair into one flat list.

    Each pair is handed to ``gen_examples_for_word_and_rankings`` together
    with the four bounds, and the per-pair results are concatenated in the
    order the pairs appear in ``words_and_rankings``.
    """
    return [
        example
        for word, rankings in words_and_rankings
        for example in gen_examples_for_word_and_rankings(
            word, rankings,
            min_per_pair, max_per_pair,
            min_per_word, max_per_word)
    ]
def seg(char: str) -> PossibleSegments:
    """Return a PossibleSegments whose only choice is ``Segment(char)``, with probability 1.0."""
    only_choice = (Segment(char), 1.0)
    return PossibleSegments([only_choice])
def single_ranking(ranking: Ranking) -> PossibleRankings:
    """Return a PossibleRankings whose only choice is ``ranking``, with probability 1.0."""
    only_choice = (ranking, 1.0)
    return PossibleRankings([only_choice])
def u_seg(chars: Collection[str]) -> PossibleSegments:
    """Return a PossibleSegments giving each character in ``chars`` equal probability.

    Every character becomes a ``Segment`` weighted ``1 / len(chars)``.
    """
    weighted_segments = []
    for char in chars:
        # The weight is computed per item, so an empty collection never divides.
        weighted_segments.append((Segment(char), 1 / len(chars)))
    return PossibleSegments(weighted_segments)