Skip to content

Commit

Permalink
Improve LogosNestedHyperlinkPostprocessor
Browse files Browse the repository at this point in the history
It turns out that when the document contains complex scripts, text ranges
in the Word document may get split into several pieces even though their
formatting is the same. The LogosNestedHyperlinkPostprocessor depends on
text with the same formatting being in a single region (for the hyperlink
regions). Therefore, search for such split regions and merge them (showing
a warning) before converting.

This should fix issues with Burmese bibles.
  • Loading branch information
schierlm committed Dec 20, 2023
1 parent 9a62486 commit 0a900bf
Showing 1 changed file with 33 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,39 @@ public Iterator<String> getPrefixes(String uri) {
String mostCommonStyle = styleNames.stream().collect(Collectors.groupingBy(s -> s, Collectors.counting()))
.entrySet().stream().max(Comparator.comparing(Entry::getValue)).map(Entry::getKey).orElse(null);
NodeList strikes = (NodeList) xp.evaluate("//w:strike", doc, XPathConstants.NODESET);
// When the Bible text uses complex scripts, a struck-through (hyperlink) range may be
// split into several adjacent runs with the same formatting - merge them first.
// NOTE(review): the w:color values of the two runs are not compared before merging;
// confirm that adjacent regions with different colors never need to stay separate.
int fixupCount = 0;
// Walk backwards so that a chain of adjacent runs collapses step by step into its first run.
for (int i = strikes.getLength() - 1; i >= 1; i--) {
    Node strike = strikes.item(i);
    Node prevStrike = strikes.item(i - 1);
    // w:strike -> parent (run properties) -> grandparent (run)
    Node run = strike.getParentNode().getParentNode();
    Node prevRun = prevStrike.getParentNode().getParentNode();

    // Only merge runs that directly follow each other; the previous run may be the
    // last child of its parent, in which case there is nothing to merge with.
    Node afterPrevRun = prevRun.getNextSibling();
    if (afterPrevRun == null || !afterPrevRun.isSameNode(run))
        continue;

    // The run properties of the current run must consist of w:strike alone,
    // or w:strike followed by exactly one w:color.
    if (strike.getPreviousSibling() != null)
        continue;
    Node strikeNext = strike.getNextSibling();
    if (strikeNext != null && !strikeNext.getNodeName().equals("w:color"))
        continue;
    if (strikeNext != null && strikeNext.getNextSibling() != null)
        continue;

    // Same requirement for the previous run. (These checks used to inspect the
    // siblings of the current strike by mistake, which skipped the validation of
    // the previous run and could throw a NullPointerException.)
    if (prevStrike.getPreviousSibling() != null)
        continue;
    Node prevStrikeNext = prevStrike.getNextSibling();
    if (prevStrikeNext != null && !prevStrikeNext.getNodeName().equals("w:color"))
        continue;
    if (prevStrikeNext != null && prevStrikeNext.getNextSibling() != null)
        continue;

    // The element following the run properties must start with a text node and
    // carry no attributes; a run without such an element cannot be merged.
    Node prevTextParent = prevStrike.getParentNode().getNextSibling();
    if (prevTextParent == null)
        continue;
    Node prevText = prevTextParent.getFirstChild();
    if (!(prevText instanceof Text) || prevTextParent.getAttributes().getLength() != 0)
        continue;
    Node textParent = strike.getParentNode().getNextSibling();
    if (textParent == null)
        continue;
    Node text = textParent.getFirstChild();
    if (!(text instanceof Text) || textParent.getAttributes().getLength() != 0)
        continue;

    // Append the text of the current run to the previous run and drop the current run.
    prevText.setNodeValue(prevText.getNodeValue() + text.getNodeValue());
    run.getParentNode().removeChild(run);
    fixupCount++;
}
if (fixupCount > 0) {
    System.out.println("WARNING: Merged " + fixupCount + " adjacent hyperlink regions; this should only happen if the document uses complex scripts");
}
strikes = (NodeList) xp.evaluate("//w:strike", doc, XPathConstants.NODESET);
for (int i = 0; i < strikes.getLength(); i++) {
Node strike = strikes.item(i);
if (strike.getPreviousSibling() != null)
Expand Down

0 comments on commit 0a900bf

Please sign in to comment.