Skip to content

Commit

Permalink
Add WIVU Morphology support
Browse files Browse the repository at this point in the history
  • Loading branch information
schierlm committed Mar 12, 2024
1 parent 9af1065 commit 7622542
Show file tree
Hide file tree
Showing 20 changed files with 496 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -898,8 +898,15 @@ else if (expandedStrongs[i].charAt(0) == 'H')
}
}
if (rmac != null && i < rmac.length) {
for(String morph : convertMorphology(rmac[i]).split(":")) {
links.add("LogosMorphGr:" + morph);
if (rmac[i].matches(Utils.RMAC_REGEX)) {
for(String morph : convertMorphology(rmac[i]).split(":")) {
links.add("LogosMorphGr:" + morph);
}
} else if (rmac[i].matches(Utils.WIVU_REGEX)) {
String prefix = rmac[i].startsWith("A") ? "WIVUMorphAram:" : "WIVUMorphHeb:";
for(String morph : rmac[i].substring(1).split("/+")) {
links.add(prefix+morph);
}
}
}
}
Expand Down Expand Up @@ -1036,7 +1043,7 @@ public static ExtraLinkCondition parse(String[] conditions) {
verseNumber = new Versification.Reference(BookID.fromOsisId(parts[0]), Integer.parseInt(parts[1]), parts[2]);
} else if (cond.matches("[A-Z][1-9][0-9]*")) {
strongNumbers.add(cond);
} else if (cond.matches(Utils.RMAC_REGEX)) {
} else if (cond.matches(Utils.MORPH_REGEX)) {
rmacNumbers.add(cond);
} else {
throw new IllegalArgumentException("Unsupported condition format");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ public void testPatternValidity() {
Assert.assertTrue("Mismatch: " + pattern, pattern.matches("([A-Z1-3-]++|\\[[A-Z1-3]++\\])*+"));
for (String expansion : expandPattern(pattern)) {
Assert.assertTrue("Not accepted: " + expansion, expansion.matches(Utils.RMAC_REGEX));
Assert.assertFalse("RMAC may not be WIVU: " + expansion, expansion.matches(Utils.WIVU_REGEX));
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,25 @@ public void testPatternCoverage() {
Assert.assertEquals(expandedPatterns, expectedPatterns);
}

private Set<String> expandToPatterns(String regex) {
protected static Set<String> expandToPatterns(String regex) {
ParsePosition p = new ParsePosition(0);
Set<String> result = expandList(regex, p);
if (p.getIndex() != regex.length())
throw new RuntimeException("Unsupported character in regex: " + regex.substring(p.getIndex()));
return result;
}

private Set<String> expandList(String regex, ParsePosition p) {
private static Set<String> expandList(String regex, ParsePosition p) {
Set<String> result = new HashSet<>();
while (p.getIndex() != regex.length() && regex.charAt(p.getIndex()) != ')') {
result.addAll(expandAlternative(regex, p));
}
return result;
}

private Set<String> expandAlternative(String regex, ParsePosition pos) {
private static Set<String> expandAlternative(String regex, ParsePosition pos) {
List<Set<String>> parts = new ArrayList<>();
String lookAt = regex.substring(pos.getIndex());
int p = pos.getIndex();
while (p < regex.length()) {
if (regex.startsWith("[", p)) {
Expand Down Expand Up @@ -75,7 +76,7 @@ private Set<String> expandAlternative(String regex, ParsePosition pos) {
break;
} else {
int ps = p;
while (regex.charAt(p) == '-' || (regex.charAt(p) >= 'A' && regex.charAt(p) <= 'Z') || (regex.charAt(p) >= '0' && regex.charAt(p) <= '9')) {
while (regex.charAt(p) == '-' || regex.charAt(p) == '/' || (regex.charAt(p) >= 'A' && regex.charAt(p) <= 'Z') || (regex.charAt(p) >= 'a' && regex.charAt(p) <= 'z') || (regex.charAt(p) >= '0' && regex.charAt(p) <= '9')) {
p++;
}
if (p == ps) {
Expand All @@ -98,6 +99,6 @@ private Set<String> expandAlternative(String regex, ParsePosition pos) {
}
parts.set(0, j);
}
return parts.get(0);
return parts.isEmpty() ? new HashSet<>(Arrays.asList("")) : parts.get(0);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package biblemulticonverter.logos.format;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import org.junit.Assert;
import org.junit.Test;

import biblemulticonverter.data.Utils;

/**
 * Test that all WIVU morphology patterns are valid. The pattern helpers
 * ({@link #computePatterns()} and {@link #expandPattern(String)}) are also used
 * by {@code WIVURoundtripHTMLTestGenerator} to build its list of test tags.
 */
public class WIVUConversionTest {

	/**
	 * When the system property {@code biblemulticonverter.unittest.runallpatterns}
	 * is {@code true}, {@link #computePatterns()} additionally produces
	 * multi-part patterns; otherwise only a small sample is generated.
	 */
	private static final boolean RUN_ALL_PATTERNS = Boolean.getBoolean("biblemulticonverter.unittest.runallpatterns");

	/**
	 * Patterns for a single WIVU part. The two-character prefix states for
	 * which language prefix the part is used: {@code H=} only after
	 * {@code H}, {@code A=} only after {@code A}, {@code B=} after both (see
	 * {@link #isValid(String, String)}). Character classes in square brackets
	 * are expanded by {@link #expandPattern(String)}.
	 */
	private static final String[] PATTERN_PARTS = {
			"B=C",
			"B=D",
			"B=T",
			"B=R",
			"B=R[d]",
			"B=A[acgo]",
			"B=A[acgo][bcfmx][dpsx][acd]",
			"B=N[cgtx]",
			"B=N[cgtx][bcfmx][dpsx][acd]",
			"B=Np",
			"B=Np[mflt]",
			"B=P[dfipr]",
			"B=P[dfipr][123x][bcfm][dps]",
			"B=S[dhnp]",
			"B=S[dhnp][123x][bcfm][dps]",
			"B=T[acdeijmnor]",
			"H=V[DHKLMNOPQcfhijklmopqrtuvwyz][pqiwhjvrsauc]",
			"A=V[GHLMOPQabcefhilmopqrstuvwz][pqiwhjvrsauc]",
			"H=V[DHKLMNOPQcfhijklmopqrtuvwyz][pqiwhjvrsauc][123][bcfm][dps]",
			"A=V[GHLMOPQabcefhilmopqrstuvwz][pqiwhjvrsauc][123][bcfm][dps]",
			"H=V[DHKLMNOPQcfhijklmopqrtuvwyz][pqiwhjvrsauc][ac]",
			"A=V[GHLMOPQabcefhilmopqrstuvwz][pqiwhjvrsauc][ac]",
			"H=V[DHKLMNOPQcfhijklmopqrtuvwyz][pqiwhjvrsauc][bfm]",
			"A=V[GHLMOPQabcefhilmopqrstuvwz][pqiwhjvrsauc][bfm]",
			"H=V[DHKLMNOPQcfhijklmopqrtuvwyz][pqiwhjvrsauc][bcfm][dps][acd]",
			"A=V[GHLMOPQabcefhilmopqrstuvwz][pqiwhjvrsauc][bcfm][dps][acd]",
	};

	/**
	 * Build the list of (still unexpanded) WIVU tag patterns to test. Every
	 * pattern starts with the language prefix {@code H} or {@code A}, followed
	 * by one or more parts separated by {@code /} or {@code //}.
	 *
	 * @return patterns that may still contain {@code [...]} character classes
	 */
	protected static List<String> computePatterns() {
		List<String> result = new ArrayList<>();
		// short parts only, to keep the number of two-part combinations manageable
		List<String> partSample = Arrays.stream(PATTERN_PARTS).filter(p -> p.length() < 10).collect(Collectors.toList());

		for (String part : PATTERN_PARTS) {
			// length 1
			if (isValid("H", part)) {
				result.add("H" + part.substring(2));
			}
			if (isValid("A", part)) {
				result.add("A" + part.substring(2));
			}
			if (RUN_ALL_PATTERNS) {
				// length 2: both parts have to be allowed for the chosen language
				for (String part2 : partSample) {
					if (isValid("H", part) && isValid("H", part2)) {
						result.add("H" + part.substring(2) + "/" + part2.substring(2));
					}
					if (isValid("A", part) && isValid("A", part2)) {
						result.add("A" + part2.substring(2) + "/" + part.substring(2));
					}
				}
				// a sample of longer ones
				if (isValid("H", part)) {
					result.add("HC//" + part.substring(2) + "/D");
					result.add("H" + part.substring(2) + "/Nt/D/C");
				}
				if (isValid("A", part)) {
					result.add("ARd/R/" + part.substring(2));
				}
			} else if (isValid("H", part)) {
				result.add("HC//" + part.substring(2));
			}
		}
		return result;
	}

	/**
	 * Check whether a pattern part may be used with the given language prefix.
	 *
	 * @param lang language prefix, {@code "H"} or {@code "A"}
	 * @param part entry of {@link #PATTERN_PARTS}
	 * @return {@code true} if the part is marked for both languages
	 *         ({@code B=}) or for exactly this language
	 */
	private static boolean isValid(String lang, String part) {
		return part.startsWith("B=") || part.startsWith(lang + "=");
	}

	/**
	 * Expand the character classes of a pattern into all concrete strings.
	 * {@code [abc]} selects one character; {@code [[aabbcc]]} selects one
	 * two-character group.
	 *
	 * @param pattern pattern, optionally containing character classes
	 * @return all expansions; a pattern without {@code [} expands to itself
	 */
	protected static List<String> expandPattern(String pattern) {
		int pos = pattern.indexOf('[');
		List<String> result = new ArrayList<>();
		if (pos == -1) {
			result.add(pattern);
		} else if (pattern.startsWith("[[", pos)) {
			int endPos = pattern.indexOf("]]", pos);
			for (String suffix : expandPattern(pattern.substring(endPos + 2))) {
				for (int i = pos + 2; i < endPos; i += 2) {
					result.add(pattern.substring(0, pos) + pattern.substring(i, i + 2) + suffix);
				}
			}
		} else {
			int endPos = pattern.indexOf(']', pos);
			for (String suffix : expandPattern(pattern.substring(endPos + 1))) {
				for (int i = pos + 1; i < endPos; i++) {
					result.add(pattern.substring(0, pos) + pattern.charAt(i) + suffix);
				}
			}
		}
		return result;
	}

	/**
	 * Test that all WIVU morphology tags created from the patterns are accepted
	 * by the regular expression, and that none of them is also accepted as
	 * RMAC.
	 */
	@Test
	public void testPatternValidity() {
		for (String pattern : computePatterns()) {
			Assert.assertTrue("Mismatch: " + pattern, pattern.matches("([A-Za-z1-3/]++|\\[[A-Za-z1-3]++\\])*+"));
			for (String expansion : expandPattern(pattern)) {
				Assert.assertTrue("Not accepted: " + expansion, expansion.matches(Utils.WIVU_REGEX));
				Assert.assertFalse("WIVU may not be RMAC: " + expansion, expansion.matches(Utils.RMAC_REGEX));
			}
		}
	}

	/**
	 * Test that all WIVU morphology tags that are accepted by the regular
	 * expression are covered by the patterns.
	 */
	@Test
	public void testPatternCoverage() {
		String regex = Utils.WIVU_PART_REGEX;
		List<String> patternParts = new ArrayList<>(Arrays.asList(PATTERN_PARTS));
		// Fold every adjacent H=/A= pair that differs only in its first
		// character class into a single B= pattern whose first class is the
		// union of both classes.
		for (int i = 0; i < patternParts.size() - 1; i++) {
			String p1 = patternParts.get(i);
			String p2 = patternParts.get(i + 1);
			if (p1.startsWith("H=") && p2.startsWith("A=")) {
				int pos0 = p1.indexOf('[');
				int pos1 = p1.indexOf(']');
				int pos2 = p2.indexOf(']');
				if (p1.substring(pos1).equals(p2.substring(pos2)) && p1.substring(1, pos0 + 1).equals(p2.substring(1, pos0 + 1))) {
					String range1 = p1.substring(pos0 + 1, pos1);
					String range2 = p2.substring(pos0 + 1, pos2);
					String pb = "B=" + p1.substring(2, pos0 + 1) + mergeSortedRanges(range1, range2) + p1.substring(pos1);
					// both entries become identical; the set below removes the duplicate
					patternParts.set(i, pb);
					patternParts.set(i + 1, pb);
				}
			}
		}
		for (int i = 0; i < patternParts.size(); i++) {
			Assert.assertTrue("Starts with B=: "+patternParts.get(i), patternParts.get(i).startsWith("B="));
			patternParts.set(i, patternParts.get(i).substring(2));
		}
		Set<String> expandedPatterns = RMACPatternCompletenessTest.expandToPatterns(regex);
		Set<String> expectedPatterns = new HashSet<>(patternParts);
		// JUnit expects (expected, actual)
		Assert.assertEquals(expectedPatterns, expandedPatterns);
	}

	/**
	 * Merge two character ranges into their union, dropping characters that
	 * occur at the same merge position in both. The result is only sorted and
	 * duplicate-free if both inputs are sorted ascending.
	 *
	 * @param range1 first range
	 * @param range2 second range
	 * @return merged range
	 */
	private static String mergeSortedRanges(String range1, String range2) {
		StringBuilder merged = new StringBuilder(range1.length() + range2.length());
		int i1 = 0, i2 = 0;
		while (i1 < range1.length() && i2 < range2.length()) {
			char c1 = range1.charAt(i1);
			char c2 = range2.charAt(i2);
			if (c1 == c2) {
				merged.append(c1);
				i1++;
				i2++;
			} else if (c1 < c2) {
				merged.append(c1);
				i1++;
			} else {
				merged.append(c2);
				i2++;
			}
		}
		// at most one of the two remainders is non-empty
		merged.append(range1.substring(i1)).append(range2.substring(i2));
		return merged.toString();
	}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package biblemulticonverter.logos.format;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

/**
 * Writes {@code wivutest.html}, a page that feeds every WIVU morphology tag
 * produced by {@link WIVUConversionTest} to the JavaScript function
 * {@code renderWIVU} (presumably provided by the referenced {@code script.js})
 * and reports rendering errors and duplicate renderings.
 */
public class WIVURoundtripHTMLTestGenerator {

	/** Markup up to and including the opening of the {@code choices} array. */
	private static final String[] PAGE_HEAD = {
			"<html>\n<script type=\"text/javascript\" src=\"script.js\"></script>\n<script type=\"text/javascript\">\n",
			"let choices = [",
	};

	/** Everything after the array elements: the check loop and the page body. */
	private static final String[] PAGE_TAIL = {
			"];\n",
			"window.onload = function() {\n",
			"\tlet errors = \"\", log = \"\", seen = {};\n\tfor (let wivu of choices) { \n\t\tlet x;\n",
			"\t\ttry {\n\t\t\tx = renderWIVU(wivu);\n\t\t} catch (e) {\n\t\t\terrors +=\"<br>\"+wivu+\" results in error: \"+e;\n",
			"\t\t\tcontinue;\n\t\t}\n\t\tif (seen[x]) {\n\t\t\terrors += \"<br>Both \"+seen[x]+\" and \"+wivu+\" render to the same value: \"+x;\n",
			"\t\t} else if (x == wivu || x.endsWith(wivu.substring(1)) || x.indexOf(\"undefined\") != -1) {\n\t\t\terrors += \"<br>\"+wivu + \" renders as \" + x;\n\t\t} else {\n",
			"\t\t\tlog +=\"<br>\"+wivu+\" renders as \" + x;\n\t\t}\n\t\tseen[x] = wivu;\n\t}\n",
			"\tdocument.getElementById(\"errors\").innerHTML = errors; document.getElementById(\"log\").innerHTML = log;\n};\n",
			"</script><body><h1>WIVU Test</h1><div id=\"errors\">(errors)</div><h2>Log</h2><div id=\"log\"></div></body></html>",
	};

	// TODO (left open in the original source; no further detail given)

	/**
	 * Generate the test page in the current working directory.
	 *
	 * @param args ignored
	 * @throws Exception if the file cannot be written
	 */
	public static void main(String[] args) throws Exception {
		FileOutputStream target = new FileOutputStream("wivutest.html");
		try (BufferedWriter page = new BufferedWriter(new OutputStreamWriter(target, StandardCharsets.UTF_8))) {
			for (String chunk : PAGE_HEAD) {
				page.write(chunk);
			}
			writeChoices(page);
			for (String chunk : PAGE_TAIL) {
				page.write(chunk);
			}
		}
	}

	/**
	 * Write every expanded WIVU tag as a single-quoted JavaScript string
	 * followed by a comma.
	 */
	private static void writeChoices(BufferedWriter page) throws Exception {
		for (String tagPattern : WIVUConversionTest.computePatterns()) {
			for (String tag : WIVUConversionTest.expandPattern(tagPattern)) {
				page.write("'" + tag + "',");
			}
		}
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,12 @@
<simpleType name="RMACType">
<annotation>
<documentation>
Format of a valid Robinson's Morphological Analysis Code. Some formats will use a different representation,
Format of a valid Robinson's Morphological Analysis Code or WIVU Hebrew Morphology Code. Some formats will use a different representation,
so to make conversion work without errors, the allowed pattern is restricted here.
</documentation>
</annotation>
<restriction base="string">
<pattern value="(ADV|CONJ|COND|PRT|PREP|INJ|ARAM|HEB|N-PRI|A-NUI|N-LI|N-OI|[NARCDTKIXQFSP](-[123]?[NVGDA][SP][MFN]?)?)(-(S|C|ABB|I|N|K|ATT))?|V-([PIFARLX]|2[FARL])[AMPEDON][ISOMNP](-([123][SP]|[NGDAV][SPD][MFN]))?(-ATT)?"></pattern>
<pattern value="(ADV|CONJ|COND|PRT|PREP|INJ|ARAM|HEB|N-PRI|A-NUI|N-LI|N-OI|[NARCDTKIXQFSP](-[123]?[NVGDA][SP][MFN]?)?|[NA]-[NVGDA][SP][MFN]-([PLT]|[PL]G|LI)|A-[NVGDA][SP][MFN]-NUI|S-[123][SP][NVGDA][SP][MFN])(-(S|C|ABB|I|N|K|ATT|ARAM|HEB))?|V-([PIFARLX]|2[PFARL])[AMPEDONQX][ISOMNP](-([123][SP]|[NGDAV][SPD][MFN]))?(-ATT|-ARAM|-HEB)?|[HA](C|D|Np[mflt]?|(A[acgo]|N[cgtx])(|[bcfmx][dpsx][acd])|(P[dfipr]|S[dhnp])(|[123x][bcfm][dps])|R[d]?|T[acdeijmnor]?|V[DGHKLMNOPQabcefhijklmopqrstuvwyz][pqiwhjvrsauc](|[bfm]|[123][bcfm][dps]|[bcfm][dps][acd]|[ac]))(//?(C|D|Np[mflt]?|(A[acgo]|N[cgtx])(|[bcfmx][dpsx][acd])|(P[dfipr]|S[dhnp])(|[123x][bcfm][dps])|R[d]?|T[acdeijmnor]?|V[DGHKLMNOPQabcefhijklmopqrstuvwyz][pqiwhjvrsauc](|[bfm]|[123][bcfm][dps]|[bcfm][dps][acd]|[ac])))*"></pattern>
</restriction>
</simpleType>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -485,8 +485,8 @@ private String convertFromVerse(String text, Visitor<RuntimeException> vv, Hyper
pos = text.indexOf("</m>");
rmac = cleanText(text.substring(3, pos));
text = text.substring(pos + 4);
if (!Utils.compilePattern(Utils.RMAC_REGEX).matcher(rmac).matches()) {
System.out.println("WARNING: Skipping malformed RMAC morphology code: " + rmac);
if (!Utils.compilePattern(Utils.MORPH_REGEX).matcher(rmac).matches()) {
System.out.println("WARNING: Skipping malformed RMAC/WIVU morphology code: " + rmac);
rmac = null;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ private String convertFromVerse(String text, Visitor<RuntimeException> vv, boole
String[] rmacs = mtags.isEmpty() ? null : new String[mtags.size()];
for (int i = 0; i < mtags.size(); i++) {
rmacs[i] = mtags.get(i);
if (!Utils.compilePattern(Utils.RMAC_REGEX).matcher(rmacs[i]).matches()) {
System.out.println("WARNING: Skipping malformed RMAC morphology code: " + rmacs[i]);
if (!Utils.compilePattern(Utils.MORPH_REGEX).matcher(rmacs[i]).matches()) {
System.out.println("WARNING: Skipping malformed RMAC/WIVU morphology code: " + rmacs[i]);
rmacs = null;
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ private GrammarInformation(char[] strongsPrefixes, int[] strongs, String[] rmac,
if (rmac.length == 0)
throw new IllegalArgumentException("RMAC may not be empty");
for (String entry : rmac) {
Utils.validateString("rmac", entry, Utils.RMAC_REGEX);
Utils.validateString("morph", entry, Utils.MORPH_REGEX);
}
}
if (sourceIndices != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ public class Utils {
private static final String RMAC_VERBS = "V-([PIFARLX]|2[PFARL])[AMPEDONQX][ISOMNP](-([123][SP]|[NGDAV][SPD][MFN]))?";
public static final String RMAC_REGEX = "(" + RMAC_UNDECLINED + "|" + RMAC_DECLINED + ")(-(S|C|ABB|I|N|K|ATT|ARAM|HEB))?|" + RMAC_VERBS + "(-ATT|-ARAM|-HEB)?";

public static final String WIVU_PART_REGEX = "C|D|Np[mflt]?|(A[acgo]|N[cgtx])(|[bcfmx][dpsx][acd])|(P[dfipr]|S[dhnp])(|[123x][bcfm][dps])|R[d]?|T[acdeijmnor]?|V[DGHKLMNOPQabcefhijklmopqrstuvwyz][pqiwhjvrsauc](|[bfm]|[123][bcfm][dps]|[bcfm][dps][acd]|[ac])";
public static final String WIVU_REGEX = "[HA](" + WIVU_PART_REGEX + ")(//?(" + WIVU_PART_REGEX + "))*";

public static final String MORPH_REGEX = RMAC_REGEX + "|" + WIVU_REGEX;

public static int validateNumber(String name, int value, int min, int max) {
if (value < min || value > max)
throw new IllegalArgumentException(name + " is invalid: " + value);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import biblemulticonverter.data.FormattedText.LineBreakKind;
import biblemulticonverter.data.FormattedText.RawHTMLMode;
import biblemulticonverter.data.FormattedText.Visitor;
import biblemulticonverter.data.Utils;
import biblemulticonverter.data.Verse;
import biblemulticonverter.data.VirtualVerse;

Expand Down Expand Up @@ -593,7 +594,7 @@ public Visitor<RuntimeException> visitGrammarInformation(char[] strongsPrefixes,
String suffix = "";
if (rmac != null && !suffixStack.get(suffixStack.size() - 1).startsWith("@")) {
for (String r : rmac) {
String morph = rmac2Morph(r);
String morph = r.matches(Utils.RMAC_REGEX) ? rmac2Morph(r) : ""; // WIVU unimplemented!
if (morph != null)
suffix += "@" + morph;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,8 @@ private boolean parseContent(Visitor<RuntimeException> visitor, List<? extends O
if (rmacValue.endsWith("-"))
rmacValue = rmacValue.substring(0, rmacValue.length() - 1);
rmacList.set(i, rmacValue);
if (!rmacValue.matches(Utils.RMAC_REGEX)) {
System.out.println("WARNING: Skipping invalid RMAC: " + rmacValue);
if (!rmacValue.matches(Utils.MORPH_REGEX)) {
System.out.println("WARNING: Skipping invalid RMAC/WIVU: " + rmacValue);
rmacList.remove(i);
i--;
}
Expand Down

0 comments on commit 7622542

Please sign in to comment.