Merge pull request #264 from Spacelog/upgrade-to-python-3
Upgrade to Python 3
SteveMarshall committed Mar 30, 2024
2 parents 51e4792 + 1525410 commit d5a1cb4
Showing 36 changed files with 402 additions and 401 deletions.
5 changes: 3 additions & 2 deletions Dockerfile
@@ -17,16 +17,17 @@ CMD ["overmind", "start"]
 RUN apt-get update -qq && \
     apt-get install -yq \
     imagemagick optipng procps \
-    python python-pip python-xapian \
+    python3 python3-pip python3-xapian \
     redis-server \
     nginx
 ENV PYTHONUNBUFFERED 1
 RUN mkdir -p /src
 
 WORKDIR /src
 COPY requirements.txt /src/
-RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip pip3 install -r requirements.txt
 COPY . /src/
+ENV PYTHONIOENCODING utf-8:ignore
 RUN /etc/init.d/redis-server start && \
     make all && \
     redis-cli shutdown save
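
Note: the image now installs the python3 toolchain and sets PYTHONIOENCODING so stream encoding errors are ignored rather than fatal during indexing. A minimal sanity check along these lines (a hypothetical snippet, not part of the repo) catches an image whose interpreter still resolves to Python 2:

import os
import sys

# Fail fast if the container still resolves to Python 2.
assert sys.version_info >= (3,), "Spacelog now requires Python 3"
print("Python %d.%d, PYTHONIOENCODING=%s" % (
    sys.version_info.major, sys.version_info.minor,
    os.environ.get("PYTHONIOENCODING", "unset"),
))
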
2 changes: 1 addition & 1 deletion Makefile
@@ -5,7 +5,7 @@ website_scss_components = $(wildcard website/static/css/*/*.scss)
 global_scss_sources = $(wildcard global/static/css/*.scss)
 global_css_targets = $(patsubst %.scss,%.css,$(global_scss_sources))
 global_scss_components = $(wildcard global/static/css/*/*.scss)
-PYTHON ?= python
+PYTHON ?= python3
 SASS ?= pyscss
 
 # Dev Django runserver variables
146 changes: 72 additions & 74 deletions backend/api.py

Large diffs are not rendered by default.

78 changes: 40 additions & 38 deletions backend/indexer.py
@@ -1,4 +1,4 @@
-from __future__ import with_statement
+
 import os
 import sys
 import re
@@ -13,15 +13,15 @@
 
 from backend.parser import TranscriptParser, MetaParser
 from backend.api import Act, KeyScene, Character, Glossary, LogLine
-from backend.util import seconds_to_timestamp
+from backend.util import redis_connection, seconds_to_timestamp
 
 search_db = xappy.IndexerConnection(
     os.path.join(os.path.dirname(__file__), '..', 'xappydb'),
 )
 
 def mission_time_to_timestamp(mission_time):
     """Takes a mission time string (XX:XX:XX:XX) and converts it to a number of seconds"""
-    d,h,m,s = map(int, mission_time.split(':'))
+    d,h,m,s = list(map(int, mission_time.split(':')))
     timestamp = d*86400 + h*3600 + m*60 + s
 
     if mission_time[0] == "-":
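
Note: under Python 3, map() returns a lazy iterator instead of a list. Tuple unpacking consumes an iterator just fine, so the list() wrapper here is the conservative, 2to3-style rewrite rather than a strict necessity. A quick illustration (hypothetical snippet):

parts = map(int, "02:03:04:05".split(":"))
print(parts)        # a <map object>, not a list, under Python 3
d, h, m, s = parts  # unpacking still works: it just iterates
print(d*86400 + h*3600 + m*60 + s)  # 183845
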
@@ -96,7 +96,7 @@ def __init__(self, redis_conn, mission_name, transcript_name, parser):
             xappy.FieldActions.STORE_CONTENT,
         )
         # Add names as synonyms for speaker identifiers
-        characters = Character.Query(self.redis_conn, self.mission_name).items()
+        characters = list(Character.Query(self.redis_conn, self.mission_name).items())
         self.characters = {}
         for character in characters:
             self.characters[character.identifier] = character
@@ -146,10 +146,10 @@ def add_to_search_index(self, mission, id, chunk, weight, timestamp):
             doc.fields.append(xappy.Field("speaker", line['speaker']))
         doc.id = id
         try:
-            search_db.replace(search_db.process(doc))
+            search_db.replace(doc)
         except xappy.errors.IndexerError:
-            print "umm, error"
-            print id, lines
+            print("umm, error")
+            print(id, lines)
             raise
 
     def index(self):
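
Note: print changes from statement to function in Python 3, and the old form is a SyntaxError, so these conversions cannot be deferred. The same hunk also passes the document straight to replace(), presumably because xappy processes unprocessed documents itself. The print change in isolation (hypothetical values):

id, lines = "a13/TEC:123", ["CDR: Houston, we've had a problem."]
# Python 2 statement form, no longer parses under Python 3:
#   print "umm, error"
#   print id, lines
print("umm, error")
print(id, lines)  # same space-separated output as the old statement
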
@@ -171,8 +171,8 @@ def index(self):
         for chunk in self.parser.get_chunks():
             timestamp = chunk['timestamp']
             log_line_id = "%s:%i" % (self.transcript_name, timestamp)
-            if timestamp <= previous_timestamp:
-                raise Exception, "%s should be after %s" % (seconds_to_timestamp(timestamp), seconds_to_timestamp(previous_timestamp))
+            if previous_timestamp and timestamp <= previous_timestamp:
+                raise Exception("%s should be after %s" % (seconds_to_timestamp(timestamp), seconds_to_timestamp(previous_timestamp)))
             # See if there's transcript page info, and update it if so
             if chunk['meta'].get('_page', 0):
                 current_transcript_page = int(chunk["meta"]['_page'])
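
Note: two changes in this hunk. First, raise Exception, msg is Python 2-only syntax; the exception object is now constructed explicitly. Second, the new previous_timestamp guard skips the ordering check when the previous timestamp is unset or zero, so the first chunk no longer trips the comparison. A sketch (hypothetical helper, raw seconds instead of formatted timestamps):

def check_order(previous_timestamp, timestamp):
    # Py2 spelling, a SyntaxError in Python 3:
    #   raise Exception, "out of order"
    if previous_timestamp and timestamp <= previous_timestamp:
        raise Exception("%s should be after %s" % (timestamp, previous_timestamp))

check_order(None, 100)  # first chunk: guard skips the comparison
check_order(100, 200)   # in order: no exception
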
@@ -185,7 +185,7 @@ def index(self):
                 if act.includes(timestamp):
                     break
             else:
-                print "Error: No act for timestamp %s" % seconds_to_timestamp(timestamp)
+                print("Error: No act for timestamp %s" % seconds_to_timestamp(timestamp))
                 continue
             # If we've filled up the current page, go to a new one
             if current_page_lines >= self.LINES_PER_PAGE or (last_act is not None and last_act != act):
@@ -232,11 +232,11 @@ def index(self):
             previous_log_line_id = log_line_id
             previous_timestamp = timestamp
             # Also store the text
-            text = u""
+            text = ""
             for line in chunk['lines']:
                 self.redis_conn.rpush(
                     "log_line:%s:lines" % log_line_id,
-                    u"%(speaker)s: %(text)s" % line,
+                    "%(speaker)s: %(text)s" % line,
                 )
                 text += "%s %s" % (line['speaker'], line['text'])
             # Store any images
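
Note: Python 3 strings are Unicode by default, so the u"" prefixes go away. (The prefix is accepted again since Python 3.3 via PEP 414, but it is a no-op.) Quick demonstration:

assert u"CDR" == "CDR"          # the u prefix does nothing in Python 3
assert isinstance("CDR", str)   # every string literal is Unicode text
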
@@ -267,7 +267,7 @@ def index(self):
             # Read the new labels into current_labels
             has_labels = False
             if '_labels' in chunk['meta']:
-                for label, endpoint in chunk['meta']['_labels'].items():
+                for label, endpoint in list(chunk['meta']['_labels'].items()):
                     if endpoint is not None and label not in current_labels:
                         current_labels[label] = endpoint
                     elif label in current_labels:
@@ -279,7 +279,7 @@ def index(self):
                         self.redis_conn.sadd("label:%s" % label, log_line_id)
                         has_labels = True
             # Expire any old labels
-            for label, endpoint in current_labels.items():
+            for label, endpoint in list(current_labels.items()):
                 if endpoint < chunk['timestamp']:
                     del current_labels[label]
             # Apply any surviving labels
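
Note: this list() wrapper matters for correctness, not just style. The loop deletes keys from current_labels while iterating, and Python 3's live dict views raise RuntimeError when the dict changes size mid-iteration; snapshotting the items first keeps the expiry loop safe. In isolation (hypothetical data):

current_labels = {"tense": 90, "launch": 30}
now = 60
# Iterating current_labels.items() directly while deleting would raise
# "RuntimeError: dictionary changed size during iteration" in Python 3.
for label, endpoint in list(current_labels.items()):
    if endpoint < now:
        del current_labels[label]
print(current_labels)  # {'tense': 90}
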
@@ -288,7 +288,7 @@ def index(self):
                 has_labels = True
             # And add this logline to search index
             if has_labels:
-                print "weight = 3 for %s" % log_line_id
+                print("weight = 3 for %s" % log_line_id)
                 weight = 3.0 # magic!
             else:
                 weight = 1.0
@@ -316,9 +316,9 @@ def index(self):
                 self.transcript_name
             )
             if not pages_set and current_transcript_page:
-                print "%s original pages: %d" % (
+                print("%s original pages: %d" % (
                     self.transcript_name, current_transcript_page
-                )
+                ))
                 self.redis_conn.hset(
                     "pages:%s" % self.mission_name,
                     self.transcript_name,
@@ -345,13 +345,13 @@ def index(self):
                 self.redis_conn.set("subdomain:%s" % subdomain, meta['name'])
             del meta['subdomains']
         utc_launch_time = meta['utc_launch_time']
-        if isinstance(utc_launch_time, basestring):
+        if isinstance(utc_launch_time, str):
             # parse as something more helpful than a number
             # time.mktime operates in the local timezone, so force that to UTC first
             os.environ['TZ'] = 'UTC'
             time.tzset()
             utc_launch_time = int(time.mktime(time.strptime(utc_launch_time, "%Y-%m-%dT%H:%M:%S")))
-            print "Converted launch time to UTC timestamp:", utc_launch_time
+            print("Converted launch time to UTC timestamp:", utc_launch_time)
         is_memorial = meta.get('memorial', False)
         if is_memorial:
             default_main_transcript = None
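
Note: Python 3 removed basestring; str is the only text type, so the isinstance check collapses to str. The TZ-forcing dance around time.mktime survives the port unchanged; calendar.timegm would be a timezone-independent alternative that avoids mutating the process environment (an alternative sketch, not what the commit does):

import calendar
import time

utc_launch_time = "1970-04-11T19:13:00"
parsed = time.strptime(utc_launch_time, "%Y-%m-%dT%H:%M:%S")
# timegm treats the struct_time as UTC; no TZ environment tricks needed.
print(calendar.timegm(parsed))  # 8709180
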
@@ -373,15 +373,15 @@ def index(self):
         # TODO: Default to highest _page from transcript if we don't have this
         transcript_pages = meta.get( 'transcript_pages' )
         if transcript_pages:
-            print "Setting original pagecounts from _meta"
+            print("Setting original pagecounts from _meta")
             self.redis_conn.hmset(
                 "pages:%s" % self.mission_name,
                 transcript_pages
             )
 
 
         copy = meta.get("copy", {})
-        for key, value in copy.items():
+        for key, value in list(copy.items()):
             copy[key] = json.dumps(value)
         if copy.get('based_on_header', None) is None:
             copy['based_on_header'] = json.dumps('Based on the original transcript')
@@ -418,7 +418,7 @@ def index_narrative_elements(self, meta):
                 "%s:%i" % (self.mission_name, i),
             )
 
-            data['start'], data['end'] = map(mission_time_to_timestamp, data['range'])
+            data['start'], data['end'] = list(map(mission_time_to_timestamp, data['range']))
             del data['range']
 
             self.redis_conn.hmset(key, data)
@@ -428,7 +428,7 @@ def index_narrative_elements(self, meta):
         key = "act:%s:0" % (self.mission_name,)
         title = meta.get('copy', {}).get('title', None)
         if title is None:
-            title = meta.get('name', u'The Mission')
+            title = meta.get('name', 'The Mission')
         else:
             title = json.loads(title)
         data = {
@@ -458,7 +458,7 @@ def index_characters(self, meta):
                 "character-ordering:%s" % self.mission_name,
                 identifier,
             )
-        for identifier, data in meta.get('characters', {}).items():
+        for identifier, data in list(meta.get('characters', {}).items()):
             mission_key = "characters:%s" % self.mission_name
             character_key = "%s:%s" % (mission_key, identifier)
 
@@ -509,14 +509,17 @@ def index_glossary(self, meta):
         shared_glossary_files_path = os.path.join(os.path.dirname( __file__ ), '..', 'missions', 'shared', 'glossary')
 
         for filename in shared_glossary_files:
-            with open(os.path.join(shared_glossary_files_path, filename)) as fh:
+            with open(
+                os.path.join(shared_glossary_files_path, filename),
+                encoding="utf-8"
+            ) as fh:
                 glossary_terms.update(json.load(fh))
 
         # Add the mission specific glossary terms
         glossary_terms.update(meta.get('glossary', {}))
 
         # Add all the glossary terms to redis
-        for identifier, data in glossary_terms.items():
+        for identifier, data in list(glossary_terms.items()):
             term_key = "%s:%s" % (self.mission_name, identifier.lower())
 
             # Add the ID to the list for this mission
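
Note: text-mode open() in Python 3 decodes with a locale-dependent default encoding, which differs between developer machines and slim Docker images; the explicit encoding="utf-8" makes glossary loading reproducible everywhere. The same pattern in isolation (hypothetical file name):

import json

# Explicit encoding behaves identically whether the container locale
# is C, POSIX, or en_US.UTF-8.
with open("glossary/apollo13.json", encoding="utf-8") as fh:
    glossary_terms = json.load(fh)
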
@@ -530,16 +533,16 @@ def index_glossary(self, meta):
                 data['abbr'] = identifier
                 data['times_mentioned'] = 0
 
-            if data.has_key('summary') and data.has_key('description'):
+            if 'summary' in data and 'description' in data:
                 data['extended_description'] = data['description']
                 data['description'] = data['summary']
                 del data['summary']
             else:
-                data['description'] = data.get('summary') or data.get('description', u"")
-            if data.has_key('description_lang'):
+                data['description'] = data.get('summary') or data.get('description', "")
+            if 'description_lang' in data:
                 data['extended_description_lang'] = data['description_lang']
                 del data['description_lang']
-            if data.has_key('summary_lang'):
+            if 'summary_lang' in data:
                 data['description_lang'] = data['summary_lang']
                 del data['summary_lang']
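
Note: dict.has_key() was removed outright in Python 3; the in operator is the replacement, and was already the idiomatic spelling in Python 2. Quick illustration:

data = {"summary": "Lunar Module"}
# data.has_key("summary") raises AttributeError under Python 3.
print("summary" in data)      # True
print("description" in data)  # False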

@@ -557,12 +560,12 @@ def index_glossary(self, meta):
 
     def index_special_searches(self, meta):
         "Indexes things that in no way sound like 'feaster legs'."
-        for search, value in meta.get('special_searches', {}).items():
+        for search, value in list(meta.get('special_searches', {}).items()):
            self.redis_conn.set("special_search:%s:%s" % (self.mission_name, search), value)
 
     def index_errors(self, meta):
         "Indexes error page info"
-        for key, info in meta.get('error_pages', {}).items():
+        for key, info in list(meta.get('error_pages', {}).items()):
             self.redis_conn.hmset(
                 "error_page:%s:%s" % (self.mission_name, key),
                 info,
@@ -586,31 +589,30 @@ def index(self):
     def index_transcripts(self):
         for filename in os.listdir(self.folder_path):
             if "." not in filename and filename[0] != "_" and filename[-1] != "~":
-                print "Indexing %s..." % filename
+                print("Indexing %s..." % filename)
                 path = os.path.join(self.folder_path, filename)
                 parser = TranscriptParser(path)
                 indexer = TranscriptIndexer(self.redis_conn, self.mission_name, "%s/%s" % (self.mission_name, filename), parser)
                 indexer.index()
 
     def index_meta(self):
-        print "Indexing _meta..."
+        print("Indexing _meta...")
         path = os.path.join(self.folder_path, "_meta")
         parser = MetaParser(path)
         indexer = MetaIndexer(self.redis_conn, self.mission_name, parser)
         indexer.index()
 
 
 if __name__ == "__main__":
-    redis_conn = redis.from_url(os.environ.get("REDIS_URL", "redis://localhost:6379"))
     transcript_dir = os.path.join(os.path.dirname( __file__ ), '..', "missions")
     dirs = os.listdir(transcript_dir)
 
-    print "Reindexing into database"
+    print("Reindexing into database")
 
     for filename in dirs:
         path = os.path.join(transcript_dir, filename)
         if filename[0] not in "_." and os.path.isdir(path) and os.path.exists(os.path.join(path, "transcripts", "_meta")):
-            print "Mission: %s" % filename
-            idx = MissionIndexer(redis_conn, filename, os.path.join(path, "transcripts"))
+            print("Mission: %s" % filename)
+            idx = MissionIndexer(redis_connection, filename, os.path.join(path, "transcripts"))
             idx.index()
 search_db.flush()
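
Note: the __main__ block now reuses a shared redis_connection imported from backend.util instead of constructing its own client, so every entry point gets the same REDIS_URL-driven configuration. backend/util.py is not shown in this diff; a plausible minimal shape for the connection half of it (an assumption, inferred only from the import and the deleted line) would be:

# backend/util.py (hypothetical sketch; the real module is not in this diff)
import os

import redis

redis_connection = redis.from_url(
    os.environ.get("REDIS_URL", "redis://localhost:6379")
)
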
30 changes: 15 additions & 15 deletions backend/parser.py
@@ -1,4 +1,4 @@
-from __future__ import with_statement
+
 import string
 try:
     import json
@@ -16,7 +16,7 @@ def __init__(self, path):
         self.path = path
 
     def get_lines(self, offset):
-        with open(self.path) as fh:
+        with open(self.path, encoding="utf-8") as fh:
             fh.seek(offset)
             for line in fh:
                 yield line
@@ -36,11 +36,11 @@ def get_chunks(self, offset=0):
                 reuse_line = None
             else:
                 try:
-                    line = lines.next()
+                    line = next(lines)
                 except StopIteration:
                     break
                 offset += len(line)
-                line = line.decode("utf8")
+                line = line
             # If it's a comment or empty line, ignore it.
             if not line.strip() or line.strip()[0] == "#":
                 continue
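
Note: two related changes here. The iterator method was renamed in Python 3 (generator.next() became __next__(), with the next() builtin as the portable spelling), and since the file is now opened in text mode with an explicit encoding, each line arrives as already-decoded str; the leftover "line = line" no-op is what remains of the deleted .decode("utf8") call and could be dropped entirely. A sketch of the next() change:

lines = iter(["[00:00:04]", "CDR: Roger."])
print(next(lines))  # the builtin works on any iterator in Python 3
# lines.next() would raise AttributeError under Python 3.
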
@@ -53,7 +53,7 @@ def get_chunks(self, offset=0):
                 try:
                     timestamp = timestamp_to_seconds(line[1:].split("]")[0])
                 except ValueError:
-                    print "Error: invalid timestamp %s" % (line[1:], )
+                    print("Error: invalid timestamp %s" % (line[1:], ))
                     raise
                 if current_chunk:
                     yield current_chunk
@@ -70,11 +70,11 @@ def get_chunks(self, offset=0):
                 name, blob = line.split(":", 1)
                 while True:
                     try:
-                        line = lines.next()
+                        line = next(lines)
                     except StopIteration:
                         break
                     offset += len(line)
-                    line = line.decode("utf8")
+                    line = line
                     if not line.strip() or line.strip()[0] == "#":
                         continue
                     if line[0] in string.whitespace:
@@ -91,18 +91,18 @@ def get_chunks(self, offset=0):
                 try:
                     data = json.loads('"%s"' % blob.replace('"', r'\"'))
                 except ValueError:
-                    print "Error: Invalid json at timestamp %s, key %s" % \
-                        (seconds_to_timestamp(timestamp), name)
+                    print("Error: Invalid json at timestamp %s, key %s" % \
+                        (seconds_to_timestamp(timestamp), name))
                     continue
                 current_chunk['meta'][name.strip()] = data
             # If it's a continuation, append to the current line
             elif line[0] in string.whitespace:
                 # Continuation line
                 if not current_chunk:
-                    print "Error: Continuation line before first timestamp header. Line: %s" % \
-                        (line)
+                    print("Error: Continuation line before first timestamp header. Line: %s" % \
+                        (line))
                 elif not current_chunk['lines']:
-                    print "Error: Continuation line before first speaker name."
+                    print("Error: Continuation line before first speaker name.")
                 else:
                     current_chunk['lines'][-1]['text'] += " " + line.strip()
             # If it's a new line, start a new line. Shock.
@@ -111,7 +111,7 @@ def get_chunks(self, offset=0):
                 try:
                     speaker, text = line.split(":", 1)
                 except ValueError:
-                    print "Error: First speaker line not in Name: Text format: %s." % (line,)
+                    print("Error: First speaker line not in Name: Text format: %s." % (line,))
                 else:
                     line = {
                         "speaker": speaker.strip(),
@@ -126,8 +126,8 @@ class MetaParser(TranscriptParser):
 
     def get_meta(self):
         try:
-            with open(self.path) as fh:
+            with open(self.path, encoding="utf-8") as fh:
                 return json.load(fh)
-        except ValueError, e:
+        except ValueError as e:
             raise ValueError("JSON decode error in file %s: %s" % (self.path, e))
         return json.load(fh)
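
Note: except ValueError, e is another Python 2-only spelling; Python 3 requires the as keyword. Re-raising with the file path attached, as get_meta does, keeps the failing file visible. (The trailing return json.load(fh) after the except block is unreachable dead code in both versions and is untouched by this commit.) The pattern in isolation, with a hypothetical path; json.JSONDecodeError subclasses ValueError:

import json

path = "missions/a13/transcripts/_meta"  # hypothetical path for illustration
try:
    with open(path, encoding="utf-8") as fh:
        meta = json.load(fh)
except ValueError as e:  # Py2's "except ValueError, e" no longer parses
    raise ValueError("JSON decode error in file %s: %s" % (path, e))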
