Skip to content

Commit

Permalink
Add chomsky_normal_form for CFGs
Browse files Browse the repository at this point in the history
  • Loading branch information
virresh committed Mar 31, 2019
1 parent bd802d5 commit 625b831
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 0 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@
- Nat Quayle Nelson <https://github.com/nqnstudios>
- Matan Rak <https://github.com/matanrak>
- Uday Krishna <https://github.com/udaykrishna>
- Viresh Gupta <https://github.com/virresh>

## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
### Contributors to the Porter Stemmer
Expand Down
107 changes: 107 additions & 0 deletions nltk/grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,113 @@ def is_chomsky_normal_form(self):
"""
return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical

def chomsky_normal_form(self, new_token_padding='@$@', flexible=False):
    """
    Return a grammar in Chomsky normal form built from this grammar.

    The conversion applies ``CFG.eliminate_start``, then
    ``CFG.binarize`` and, unless ``flexible`` is set,
    ``CFG.remove_unitary_rules``.

    :param new_token_padding: separator handed to ``CFG.binarize`` to
        customise the names of the symbols created during binarisation
    :param flexible: if true, return the grammar obtained right after
        binarisation, without calling ``CFG.remove_unitary_rules``
    :return: ``self`` if ``is_chomsky_normal_form()`` already holds,
        otherwise the converted grammar
    :raise ValueError: if the grammar has empty productions, or has a
        lexical production whose right-hand side is longer than one
        symbol
    """
    if self.is_chomsky_normal_form():
        # Nothing to convert: hand back the grammar itself so that the
        # caller always receives a grammar, never None.
        return self
    if self.productions(empty=True):
        raise ValueError(('Grammar has Empty rules. '
                          'Cannot deal with them at the moment'))

    # Mixed rules (lexical productions with more than one right-hand
    # side symbol) are not supported by the steps below.
    for rule in self.productions():
        if rule.is_lexical() and len(rule.rhs()) > 1:
            raise ValueError(
                'Cannot handled mixed rule {} => {}'.format(rule.lhs(),
                                                            rule.rhs()))

    step1 = CFG.eliminate_start(self)
    step2 = CFG.binarize(step1, new_token_padding)
    if flexible:
        return step2
    return CFG.remove_unitary_rules(step2)

@classmethod
def remove_unitary_rules(cls, grammar):
    """
    Build a grammar in which nonlexical unitary rules are expanded away.

    A production of length one for which ``is_nonlexical()`` holds
    (``A -> B``) is replaced by a production ``A -> rhs`` for every
    production ``B -> rhs`` of ``grammar``.  Replacements that are
    themselves nonlexical unitary rules are expanded in turn.  Each
    distinct unitary rule is expanded only once, so grammars with
    cyclic unitary rules (``A -> B``, ``B -> A``) terminate.

    :param grammar: the grammar whose productions are rewritten
    :return: a new ``CFG`` with the same start symbol as ``grammar``
    """
    from collections import deque

    result = []
    pending = deque()
    for rule in grammar.productions():
        if len(rule) == 1 and rule.is_nonlexical():
            pending.append(rule)
        else:
            result.append(rule)

    # Every unitary rule that has ever been queued.  Without this record
    # a cycle of unitary rules would regenerate the same rules forever.
    # NOTE(review): relies on Production being hashable -- confirm.
    seen = set(pending)

    while pending:
        rule = pending.popleft()
        for item in grammar.productions(lhs=rule.rhs()[0]):
            new_rule = Production(rule.lhs(), item.rhs())
            if len(new_rule) != 1 or new_rule.is_lexical():
                result.append(new_rule)
            elif new_rule not in seen:
                seen.add(new_rule)
                pending.append(new_rule)

    return CFG(grammar.start(), result)

@classmethod
def binarize(cls, grammar, padding='@$@'):
    """
    Rewrite every production with more than two right-hand side
    symbols as a chain of binary productions over new symbols.

    The name of each new symbol is the symbol of the current left-hand
    side, followed by ``padding`` and the symbol being split off.
    Example::

        Original:
            A => B C D
        After Conversion:
            A => B A@$@B
            A@$@B => C D

    :param grammar: the grammar whose productions are binarised
    :param padding: separator used inside the names of new symbols
    :return: a new ``CFG`` with the same start symbol as ``grammar``
    """
    binary_rules = []

    for production in grammar.productions():
        rhs = production.rhs()
        if len(rhs) <= 2:
            # Short enough already; keep the production untouched.
            binary_rules.append(production)
            continue

        # Peel off one leading symbol at a time; the remainder hangs
        # under a freshly named nonterminal.
        head = production.lhs()
        for symbol in rhs[:-2]:
            chained = Nonterminal(head.symbol() + padding + symbol.symbol())
            binary_rules.append(Production(head, (symbol, chained)))
            head = chained
        # The last two symbols close the chain.
        binary_rules.append(Production(head, rhs[-2:]))

    return CFG(grammar.start(), binary_rules)

@classmethod
def eliminate_start(cls, grammar):
    """
    Make sure the start symbol does not occur on any right-hand side.

    If the start symbol ``S`` of ``grammar`` appears on the right-hand
    side of one of its productions, a new start symbol ``S0_SIGMA`` is
    introduced together with the production ``S0_SIGMA -> S``.
    Example: given ``S -> S0 S1`` and ``S0 -> S1 S``, the rule
    ``S0_SIGMA -> S`` is added and ``S0_SIGMA`` becomes the start.

    :param grammar: the grammar to inspect
    :return: ``grammar`` itself when its start symbol never occurs on a
        right-hand side, otherwise a new ``CFG`` starting at
        ``S0_SIGMA``
    """
    old_start = grammar.start()
    productions = list(grammar.productions())
    if not any(old_start in rule.rhs() for rule in productions):
        return grammar

    new_start = Nonterminal('S0_SIGMA')
    # The right-hand side is given as a one-element list, like the
    # sequences passed to Production everywhere else in this file; a
    # bare Nonterminal is a single symbol, not a sequence of symbols.
    productions.append(Production(new_start, [old_start]))
    return CFG(new_start, productions)

def __repr__(self):
    """Return a short description stating how many productions exist."""
    count = len(self._productions)
    return '<Grammar with %d productions>' % (count,)

Expand Down

0 comments on commit 625b831

Please sign in to comment.