Skip to content

Commit

Permalink
Transcript semantic parsing (#6852)
Browse files Browse the repository at this point in the history
  • Loading branch information
tonytamsf committed Feb 4, 2024
1 parent fc69642 commit d3120ee
Show file tree
Hide file tree
Showing 11 changed files with 479 additions and 0 deletions.
Expand Up @@ -49,6 +49,7 @@ public class FeedItem extends FeedComponent implements Serializable {
private String podcastIndexTranscriptUrl;
private String podcastIndexTranscriptType;
private String podcastIndexTranscriptText;
private Transcript transcript;

private int state;
public static final int NEW = -1;
Expand Down Expand Up @@ -494,6 +495,14 @@ public void updateTranscriptPreferredFormat(String type, String url) {
}
}

/**
 * Returns the parsed transcript held by this item.
 *
 * @return the value last passed to {@link #setTranscript(Transcript)}; may be {@code null}
 */
public Transcript getTranscript() {
    return this.transcript;
}

/**
 * Stores a parsed transcript on this item, replacing any previously stored one.
 *
 * @param t the transcript to keep; stored as-is without copying or validation
 */
public void setTranscript(Transcript t) {
    this.transcript = t;
}

/**
 * Returns the transcript text stored in the {@code podcastIndexTranscriptText} field.
 *
 * @return the stored text, exactly as held by the field; may be {@code null}
 */
public String getPodcastIndexTranscriptText() {
    return this.podcastIndexTranscriptText;
}
Expand Down
@@ -0,0 +1,28 @@
package de.danoeh.antennapod.model.feed;

import java.util.Map;
import java.util.TreeMap;

/**
 * A transcript represented as a collection of {@link TranscriptSegment}s ordered by start time.
 *
 * <p>Segments are keyed by {@link TranscriptSegment#getStartTime()}, so adding a segment whose
 * start time equals that of an existing segment replaces the existing one.
 */
public class Transcript {

    /** Segments indexed by their start time; the sorted map enables nearest-time lookups. */
    private final TreeMap<Long, TranscriptSegment> segmentsMap = new TreeMap<>();

    /**
     * Adds a segment, indexed by its start time.
     *
     * @param segment the segment to add; must not be {@code null}
     */
    public void addSegment(TranscriptSegment segment) {
        final long key = segment.getStartTime();
        segmentsMap.put(key, segment);
    }

    /**
     * Finds the segment with the greatest start time that is less than or equal to {@code time}.
     * Only start times are compared; the segment's end time is not consulted.
     *
     * @param time the position to look up, in the same unit as the segment start times
     * @return the matching segment, or {@code null} if every segment starts after {@code time}
     */
    public TranscriptSegment getSegmentAtTime(long time) {
        final Map.Entry<Long, TranscriptSegment> floor = segmentsMap.floorEntry(time);
        return floor != null ? floor.getValue() : null;
    }

    /**
     * Returns the number of segments currently stored.
     *
     * @return the segment count
     */
    public int getSegmentCount() {
        return segmentsMap.size();
    }

    /**
     * Finds the entry with the smallest start time that is greater than or equal to {@code time}.
     * Note that a segment starting exactly at {@code time} is included.
     *
     * @param time the position to look up, in the same unit as the segment start times
     * @return the matching map entry, or {@code null} if no segment starts at or after {@code time}
     */
    public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) {
        return segmentsMap.ceilingEntry(time);
    }
}
@@ -0,0 +1,31 @@
package de.danoeh.antennapod.model.feed;

/**
 * Immutable value holder for one piece of a transcript: a time range, the spoken text
 * and the name of the speaker.
 *
 * <p>The class itself attaches no unit to the time values; it simply stores what it is given.
 */
public class TranscriptSegment {
    private final long startTime;
    private final long endTime;
    private final String words;
    private final String speaker;

    /**
     * Creates a segment. No validation is performed on any argument.
     *
     * @param start start of the segment
     * @param end   end of the segment
     * @param w     the text spoken during the segment
     * @param s     the speaker of the segment
     */
    public TranscriptSegment(long start, long end, String w, String s) {
        this.startTime = start;
        this.endTime = end;
        this.words = w;
        this.speaker = s;
    }

    /** @return the start of the segment, as passed to the constructor */
    public long getStartTime() {
        return this.startTime;
    }

    /** @return the end of the segment, as passed to the constructor */
    public long getEndTime() {
        return this.endTime;
    }

    /** @return the text of the segment, as passed to the constructor */
    public String getWords() {
        return this.words;
    }

    /** @return the speaker of the segment, as passed to the constructor */
    public String getSpeaker() {
        return this.speaker;
    }
}
3 changes: 3 additions & 0 deletions parser/transcript/README.md
@@ -0,0 +1,3 @@
# :parser:transcript

This module provides parsers that turn podcast transcript files (JSON and SRT) into `Transcript` objects.
23 changes: 23 additions & 0 deletions parser/transcript/build.gradle
@@ -0,0 +1,23 @@
// Build script for the :parser:transcript module, built as an Android library.
plugins {
id("com.android.library")
}
// Applies the common.gradle script located two directories above this module.
// NOTE(review): the $...Version variables used below are not defined in this file;
// presumably they come from this shared script or the root project - confirm.
apply from: "../../common.gradle"

android {
// Namespace matches the Java package of the parser classes in this module.
namespace "de.danoeh.antennapod.parser.transcript"
}

dependencies {
// Project module dependency; presumably the home of the Transcript model classes - verify.
implementation project(':model')

annotationProcessor "androidx.annotation:annotation:$annotationVersion"

implementation "androidx.core:core:$coreVersion"

// Third-party helper libraries available to the parser sources.
implementation "org.apache.commons:commons-lang3:$commonslangVersion"
implementation "commons-io:commons-io:$commonsioVersion"
implementation "org.jsoup:jsoup:$jsoupVersion"

// Test-only dependencies.
testImplementation "junit:junit:$junitVersion"
testImplementation "org.robolectric:robolectric:$robolectricVersion"
}
@@ -0,0 +1,65 @@
package de.danoeh.antennapod.parser.transcript;

import org.apache.commons.lang3.StringUtils;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.internal.StringUtil;

import de.danoeh.antennapod.model.feed.Transcript;
import de.danoeh.antennapod.model.feed.TranscriptSegment;

/**
 * Parser for JSON transcripts of the form
 * {@code {"segments": [{"startTime": 0.5, "endTime": 1.2, "speaker": "...", "body": "..."}, ...]}}.
 *
 * <p>Times in the JSON are read as (fractional) seconds and converted to milliseconds.
 * Consecutive entries are merged into one {@link TranscriptSegment} until their accumulated
 * duration reaches {@link TranscriptParser#MIN_SPAN}.
 */
public class JsonTranscriptParser {

    /**
     * Parses a JSON transcript.
     *
     * @param jsonStr the JSON document; may be {@code null} or blank
     * @return the parsed transcript, or {@code null} if the input is blank, is not valid JSON,
     *         has no {@code segments} array, or yields no segments
     */
    public static Transcript parse(String jsonStr) {
        // Same guard as SrtTranscriptParser.parse: avoids handing null to the JSON parser
        // and avoids a pointless parse failure for empty input.
        if (StringUtils.isBlank(jsonStr)) {
            return null;
        }
        try {
            Transcript transcript = new Transcript();
            long spanStartTime = -1L; // -1 means "no merged span in progress"
            long spanEndTime = -1L;
            long duration = 0L;
            String speaker = "";
            StringBuilder segmentBody = new StringBuilder();
            JSONObject obj = new JSONObject(jsonStr);
            JSONArray objSegments = obj.getJSONArray("segments");

            for (int i = 0; i < objSegments.length(); i++) {
                JSONObject jsonObject = objSegments.getJSONObject(i);
                // Missing or non-numeric times fall back to -1 and therefore become negative.
                long startTime = (long) (jsonObject.optDouble("startTime", -1) * 1000L);
                long endTime = (long) (jsonObject.optDouble("endTime", -1) * 1000L);
                if (startTime < 0 || endTime < 0) {
                    // Skip the entry without touching the span in progress, so an invalid
                    // entry cannot corrupt the end time of text that is still pending.
                    continue;
                }
                if (spanStartTime == -1L) {
                    spanStartTime = startTime;
                }
                spanEndTime = endTime;
                duration += endTime - startTime;

                // The merged segment is attributed to the speaker of its last entry.
                speaker = jsonObject.optString("speaker");
                segmentBody.append(jsonObject.optString("body")).append(' ');

                if (duration >= TranscriptParser.MIN_SPAN) {
                    transcript.addSegment(new TranscriptSegment(spanStartTime, spanEndTime,
                            StringUtils.trim(segmentBody.toString()), speaker));
                    duration = 0L;
                    segmentBody.setLength(0);
                    spanStartTime = -1L;
                }
            }

            // Flush text that never reached MIN_SPAN.
            String remaining = segmentBody.toString();
            if (!StringUtil.isBlank(remaining)) {
                transcript.addSegment(new TranscriptSegment(spanStartTime, spanEndTime,
                        StringUtils.trim(remaining), speaker));
            }

            if (transcript.getSegmentCount() > 0) {
                return transcript;
            } else {
                return null;
            }

        } catch (org.json.JSONException e) {
            // Malformed JSON is treated as "no transcript available".
            e.printStackTrace();
        }
        return null;
    }
}
@@ -0,0 +1,118 @@
package de.danoeh.antennapod.parser.transcript;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.internal.StringUtil;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.danoeh.antennapod.model.feed.Transcript;
import de.danoeh.antennapod.model.feed.TranscriptSegment;

/**
 * Parser for SubRip (SRT) transcripts.
 *
 * <p>Each cue consists of a timecode line ({@code 00:00:00,000 --> 00:00:01,000}) followed by
 * one or more text lines and a blank line. Consecutive cues are merged into one
 * {@link TranscriptSegment} until their accumulated duration reaches
 * {@link TranscriptParser#MIN_SPAN}.
 */
public class SrtTranscriptParser {
    private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$");

    /**
     * Parses an SRT transcript.
     *
     * @param str the SRT document; may be {@code null} or blank
     * @return the parsed transcript, or {@code null} if the input is blank or yields no segments
     */
    public static Transcript parse(String str) {
        if (StringUtils.isBlank(str)) {
            return null;
        }

        Transcript transcript = new Transcript();
        // Literal replacement; no regular expression is needed to normalise line endings.
        List<String> lines = Arrays.asList(str.replace("\r\n", "\n").split("\n"));
        Iterator<String> iter = lines.iterator();
        String speaker = "";
        String segmentBody = "";
        long spanStartTimecode = -1L; // -1 means "no merged span in progress"
        long endTimecode = -1L;
        long duration = 0L;

        while (iter.hasNext()) {
            String line = iter.next();

            // Everything that is not a timecode line (cue numbers, stray text, blank lines)
            // is ignored here; cue text is consumed right after its timecode line below.
            if (line.isEmpty() || !line.contains("-->")) {
                continue;
            }

            String[] timecodes = line.split("-->");
            if (timecodes.length < 2) {
                continue;
            }
            long cueStart = parseTimecode(timecodes[0].trim());
            long cueEnd = parseTimecode(timecodes[1].trim());
            if (cueStart == -1 || cueEnd == -1) {
                // Skip the cue without touching the span in progress, so that a malformed
                // cue cannot invalidate the end time of text that is still pending.
                continue;
            }

            endTimecode = cueEnd;
            if (spanStartTimecode == -1) {
                spanStartTimecode = cueStart;
            }
            duration += cueEnd - cueStart;

            String text = readCueText(iter);

            // A "Speaker: text" prefix sets the current speaker. Only the first colon is
            // treated as the separator so that later colons (e.g. "10:30") stay in the text.
            // NOTE(review): any cue text containing a colon is interpreted this way.
            int colon = text.indexOf(':');
            if (colon != -1) {
                speaker = text.substring(0, colon);
                text = StringUtils.strip(text.substring(colon + 1));
            }
            if (StringUtil.isBlank(text)) {
                continue;
            }

            segmentBody = StringUtils.trim(segmentBody + " " + text);
            if (duration >= TranscriptParser.MIN_SPAN && endTimecode > spanStartTimecode) {
                transcript.addSegment(new TranscriptSegment(spanStartTimecode,
                        endTimecode,
                        segmentBody,
                        speaker));
                duration = 0L;
                spanStartTimecode = -1L;
                segmentBody = "";
            }
        }

        // Flush text that never reached MIN_SPAN.
        if (!StringUtil.isBlank(segmentBody) && endTimecode > spanStartTimecode) {
            segmentBody = StringUtils.trim(segmentBody);
            transcript.addSegment(new TranscriptSegment(spanStartTimecode,
                    endTimecode,
                    segmentBody,
                    speaker));
        }
        if (transcript.getSegmentCount() > 0) {
            return transcript;
        } else {
            return null;
        }
    }

    /**
     * Reads the text lines of one cue: consumes lines up to and including the next blank line,
     * or up to the end of input, and joins them with single spaces.
     * Checking {@code hasNext()} before every read keeps a timecode line at the very end of
     * the input from causing a {@link java.util.NoSuchElementException}.
     */
    private static String readCueText(Iterator<String> iter) {
        StringBuilder text = new StringBuilder();
        while (iter.hasNext()) {
            String line = iter.next();
            if (StringUtil.isBlank(line)) {
                break;
            }
            // StringUtils.strip is used instead of String.strip(), which is a Java 11 API.
            // NOTE(review): String.strip() may be missing on older Android runtimes - confirm.
            text.append(StringUtils.strip(line));
            text.append(' ');
        }
        return text.toString().trim();
    }

    /**
     * Converts a timecode of the form {@code 00:00:00,000} (hours:minutes:seconds,millis)
     * to milliseconds.
     *
     * @param timecode the timecode text, without surrounding whitespace
     * @return the time in milliseconds, or {@code -1} if the text does not match the format
     */
    static long parseTimecode(String timecode) {
        Matcher matcher = TIMECODE_PATTERN.matcher(timecode);
        if (!matcher.matches()) {
            return -1;
        }
        long hours = Integer.parseInt(matcher.group(1));
        long minutes = Integer.parseInt(matcher.group(2));
        long seconds = Integer.parseInt(matcher.group(3));
        long milliseconds = Integer.parseInt(matcher.group(4));
        return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
    }
}
@@ -0,0 +1,24 @@
package de.danoeh.antennapod.parser.transcript;

import org.apache.commons.lang3.StringUtils;

import de.danoeh.antennapod.model.feed.Transcript;

/**
 * Entry point for transcript parsing: picks the concrete parser from the transcript's MIME type.
 */
public class TranscriptParser {
    /** Minimum accumulated duration (1 second, in milliseconds) before merged short segments are emitted. */
    static final long MIN_SPAN = 1000L;

    /**
     * Parses transcript text with the parser that matches {@code type}.
     *
     * @param str  the raw transcript text; may be {@code null} or blank
     * @param type the MIME type of the transcript; may be {@code null}
     * @return the parsed transcript, or {@code null} if the text is blank, the type is not
     *         supported, or the selected parser produced no transcript
     */
    public static Transcript parse(String str, String type) {
        // StringUtils.isBlank also covers null input.
        if (StringUtils.isBlank(str) || type == null) {
            return null;
        }

        switch (type) {
            case "application/json":
                return JsonTranscriptParser.parse(str);
            // NOTE(review): "application/srr" is presumably a tolerated misspelling of
            // "application/srt" - confirm before removing.
            case "application/srt":
            case "application/srr":
            case "application/x-subrip":
                return SrtTranscriptParser.parse(str);
            default:
                return null;
        }
    }
}

0 comments on commit d3120ee

Please sign in to comment.