Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
479 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
28 changes: 28 additions & 0 deletions
28
model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package de.danoeh.antennapod.model.feed; | ||
|
||
import java.util.Map; | ||
import java.util.TreeMap; | ||
|
||
public class Transcript { | ||
|
||
private final TreeMap<Long, TranscriptSegment> segmentsMap = new TreeMap<>(); | ||
|
||
public void addSegment(TranscriptSegment segment) { | ||
segmentsMap.put(segment.getStartTime(), segment); | ||
} | ||
|
||
public TranscriptSegment getSegmentAtTime(long time) { | ||
if (segmentsMap.floorEntry(time) == null) { | ||
return null; | ||
} | ||
return segmentsMap.floorEntry(time).getValue(); | ||
} | ||
|
||
public int getSegmentCount() { | ||
return segmentsMap.size(); | ||
} | ||
|
||
public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) { | ||
return segmentsMap.ceilingEntry(time); | ||
} | ||
} |
31 changes: 31 additions & 0 deletions
31
model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
package de.danoeh.antennapod.model.feed; | ||
|
||
/**
 * Immutable value holder for one timed piece of a transcript: a start time, an end time,
 * the spoken words and the speaker they are attributed to.
 */
public class TranscriptSegment {
    private final long startTime;
    private final long endTime;
    private final String words;
    private final String speaker;

    /**
     * Creates a segment; the values are stored as given, without validation.
     *
     * @param start start time of the segment
     * @param end   end time of the segment
     * @param w     text of the segment
     * @param s     speaker of the segment
     */
    public TranscriptSegment(long start, long end, String w, String s) {
        this.startTime = start;
        this.endTime = end;
        this.words = w;
        this.speaker = s;
    }

    /** @return the start time passed to the constructor */
    public long getStartTime() {
        return this.startTime;
    }

    /** @return the end time passed to the constructor */
    public long getEndTime() {
        return this.endTime;
    }

    /** @return the text passed to the constructor */
    public String getWords() {
        return this.words;
    }

    /** @return the speaker passed to the constructor */
    public String getSpeaker() {
        return this.speaker;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# :parser:transcript | ||
|
||
This module provides parsers for transcripts in JSON and SRT (SubRip) formats. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Build script for the transcript parser module (an Android library).
plugins { | ||
id("com.android.library") | ||
} | ||
// Applies the build script two directories up; its contents are not visible here.
// Presumably it holds the configuration shared between modules - confirm.
apply from: "../../common.gradle" | ||
|
||
android { | ||
namespace "de.danoeh.antennapod.parser.transcript" | ||
} | ||
|
||
// The $...Version variables are not defined in this script; presumably they come from
// common.gradle or the root project - confirm.
dependencies { | ||
// Provides the Transcript and TranscriptSegment model classes the parsers produce.
implementation project(':model') | ||
|
||
// NOTE(review): the annotation artifact is declared as an annotationProcessor rather than
// implementation/compileOnly - confirm this is intended.
annotationProcessor "androidx.annotation:annotation:$annotationVersion" | ||
|
||
// NOTE(review): no androidx.core usage is visible in the parser sources - verify it is needed.
implementation "androidx.core:core:$coreVersion" | ||
|
||
// commons-lang3: StringUtils is used by the parsers.
// NOTE(review): no commons-io usage is visible in the parser sources - verify it is needed.
// jsoup: the parsers use org.jsoup.internal.StringUtil.
implementation "org.apache.commons:commons-lang3:$commonslangVersion" | ||
implementation "commons-io:commons-io:$commonsioVersion" | ||
implementation "org.jsoup:jsoup:$jsoupVersion" | ||
|
||
testImplementation "junit:junit:$junitVersion" | ||
testImplementation "org.robolectric:robolectric:$robolectricVersion" | ||
} |
65 changes: 65 additions & 0 deletions
65
...transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package de.danoeh.antennapod.parser.transcript; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
import org.json.JSONArray; | ||
import org.json.JSONObject; | ||
import org.jsoup.internal.StringUtil; | ||
|
||
import de.danoeh.antennapod.model.feed.Transcript; | ||
import de.danoeh.antennapod.model.feed.TranscriptSegment; | ||
|
||
public class JsonTranscriptParser { | ||
public static Transcript parse(String jsonStr) { | ||
try { | ||
Transcript transcript = new Transcript(); | ||
long startTime = -1L; | ||
long endTime = -1L; | ||
long segmentStartTime = -1L; | ||
long duration = 0L; | ||
String speaker = ""; | ||
String segmentBody = ""; | ||
JSONObject obj = new JSONObject(jsonStr); | ||
JSONArray objSegments = obj.getJSONArray("segments"); | ||
|
||
for (int i = 0; i < objSegments.length(); i++) { | ||
JSONObject jsonObject = objSegments.getJSONObject(i); | ||
startTime = Double.valueOf(jsonObject.optDouble("startTime", -1) * 1000L).longValue(); | ||
endTime = Double.valueOf(jsonObject.optDouble("endTime", -1) * 1000L).longValue(); | ||
if (startTime < 0 || endTime < 0) { | ||
continue; | ||
} | ||
if (segmentStartTime == -1L) { | ||
segmentStartTime = startTime; | ||
} | ||
duration += endTime - startTime; | ||
|
||
speaker = jsonObject.optString("speaker"); | ||
String body = jsonObject.optString("body"); | ||
segmentBody += body + " "; | ||
|
||
if (duration >= TranscriptParser.MIN_SPAN) { | ||
segmentBody = StringUtils.trim(segmentBody); | ||
transcript.addSegment(new TranscriptSegment(segmentStartTime, endTime, segmentBody, speaker)); | ||
duration = 0L; | ||
segmentBody = ""; | ||
segmentStartTime = -1L; | ||
} | ||
} | ||
|
||
if (!StringUtil.isBlank(segmentBody)) { | ||
segmentBody = StringUtils.trim(segmentBody); | ||
transcript.addSegment(new TranscriptSegment(segmentStartTime, endTime, segmentBody, speaker)); | ||
} | ||
|
||
if (transcript.getSegmentCount() > 0) { | ||
return transcript; | ||
} else { | ||
return null; | ||
} | ||
|
||
} catch (org.json.JSONException e) { | ||
e.printStackTrace(); | ||
} | ||
return null; | ||
} | ||
} |
118 changes: 118 additions & 0 deletions
118
.../transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package de.danoeh.antennapod.parser.transcript; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
import org.jsoup.internal.StringUtil; | ||
|
||
import java.util.Arrays; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import de.danoeh.antennapod.model.feed.Transcript; | ||
import de.danoeh.antennapod.model.feed.TranscriptSegment; | ||
|
||
public class SrtTranscriptParser { | ||
private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$"); | ||
|
||
public static Transcript parse(String str) { | ||
if (StringUtils.isBlank(str)) { | ||
return null; | ||
} | ||
str = str.replaceAll("\r\n", "\n"); | ||
|
||
Transcript transcript = new Transcript(); | ||
List<String> lines = Arrays.asList(str.split("\n")); | ||
Iterator<String> iter = lines.iterator(); | ||
String speaker = ""; | ||
StringBuilder body = new StringBuilder(); | ||
String line; | ||
String segmentBody = ""; | ||
long startTimecode = -1L; | ||
long spanStartTimecode = -1L; | ||
long endTimecode = -1L; | ||
long duration = 0L; | ||
|
||
while (iter.hasNext()) { | ||
line = iter.next(); | ||
|
||
if (line.isEmpty()) { | ||
continue; | ||
} | ||
|
||
if (line.contains("-->")) { | ||
String[] timecodes = line.split("-->"); | ||
if (timecodes.length < 2) { | ||
continue; | ||
} | ||
startTimecode = parseTimecode(timecodes[0].trim()); | ||
endTimecode = parseTimecode(timecodes[1].trim()); | ||
if (startTimecode == -1 || endTimecode == -1) { | ||
continue; | ||
} | ||
|
||
if (spanStartTimecode == -1) { | ||
spanStartTimecode = startTimecode; | ||
} | ||
duration += endTimecode - startTimecode; | ||
do { | ||
line = iter.next(); | ||
if (StringUtil.isBlank(line)) { | ||
break; | ||
} | ||
body.append(line.strip()); | ||
body.append(" "); | ||
} while (iter.hasNext()); | ||
} | ||
|
||
if (body.indexOf(":") != -1) { | ||
String [] parts = body.toString().trim().split(":"); | ||
if (parts.length < 2) { | ||
continue; | ||
} | ||
speaker = parts[0]; | ||
body = new StringBuilder(parts[1].strip()); | ||
} | ||
if (!StringUtil.isBlank(body.toString())) { | ||
segmentBody += " " + body; | ||
segmentBody = StringUtils.trim(segmentBody); | ||
if (duration >= TranscriptParser.MIN_SPAN && endTimecode > spanStartTimecode) { | ||
transcript.addSegment(new TranscriptSegment(spanStartTimecode, | ||
endTimecode, | ||
segmentBody, | ||
speaker)); | ||
duration = 0L; | ||
spanStartTimecode = -1L; | ||
segmentBody = ""; | ||
} | ||
body = new StringBuilder(); | ||
} | ||
} | ||
|
||
if (!StringUtil.isBlank(segmentBody) && endTimecode > spanStartTimecode) { | ||
segmentBody = StringUtils.trim(segmentBody); | ||
transcript.addSegment(new TranscriptSegment(spanStartTimecode, | ||
endTimecode, | ||
segmentBody, | ||
speaker)); | ||
} | ||
if (transcript.getSegmentCount() > 0) { | ||
return transcript; | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
// Time format 00:00:00,000 | ||
static long parseTimecode(String timecode) { | ||
Matcher matcher = TIMECODE_PATTERN.matcher(timecode); | ||
if (!matcher.matches()) { | ||
return -1; | ||
} | ||
long hours = Integer.parseInt(matcher.group(1)); | ||
long minutes = Integer.parseInt(matcher.group(2)); | ||
long seconds = Integer.parseInt(matcher.group(3)); | ||
long milliseconds = Integer.parseInt(matcher.group(4)); | ||
return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds; | ||
} | ||
} |
24 changes: 24 additions & 0 deletions
24
parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
package de.danoeh.antennapod.parser.transcript; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
|
||
import de.danoeh.antennapod.model.feed.Transcript; | ||
|
||
public class TranscriptParser { | ||
static final long MIN_SPAN = 1000L; // merge short segments together to form a span of 1 second | ||
|
||
public static Transcript parse(String str, String type) { | ||
if (str == null || StringUtils.isBlank(str)) { | ||
return null; | ||
} | ||
|
||
if ("application/json".equals(type)) { | ||
return JsonTranscriptParser.parse(str); | ||
} | ||
|
||
if ("application/srt".equals(type) || "application/srr".equals(type) || "application/x-subrip".equals(type)) { | ||
return SrtTranscriptParser.parse(str); | ||
} | ||
return null; | ||
} | ||
} |
Oops, something went wrong.