Merge pull request #2682 from chaoss/viz-api-patch-11
Fix NULL Data Issue in 2 visualization endpoints
sgoggins committed Feb 14, 2024
2 parents 788b75c + 84ab7b0 commit f31063d
Showing 9 changed files with 165 additions and 104 deletions.
49 changes: 25 additions & 24 deletions augur/api/routes/pull_request_reports.py
@@ -53,7 +53,7 @@ def pull_request_data_collection(repo_id, start_date, end_date):
( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response,
first_response_time,
last_response_time,
- average_time_between_responses,
+ EXTRACT ( EPOCH FROM average_time_between_responses),
assigned_count,
review_requested_count,
labeled_count,
@@ -62,15 +62,15 @@ def pull_request_data_collection(repo_id, start_date, end_date):
referenced_count,
closed_count,
head_ref_force_pushed_count,
- merged_count,
+ merged_count::INT,
milestoned_count,
unlabeled_count,
head_ref_deleted_count,
comment_count,
- lines_added,
- lines_removed,
+ COALESCE(lines_added, 0),
+ COALESCE(lines_removed, 0),
commit_count,
- file_count
+ COALESCE(file_count, 0)
FROM
repo,
repo_groups,
@@ -87,46 +87,47 @@ def pull_request_data_collection(repo_id, start_date, end_date):
count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count,
count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count,
count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count,
- count(*) FILTER (WHERE action = 'merged') AS merged_count,
- MIN(message.msg_timestamp) AS first_response_time,
- COUNT(DISTINCT message.msg_timestamp) AS comment_count,
- MAX(message.msg_timestamp) AS last_response_time,
- (MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp) AS average_time_between_responses
- FROM pull_request_events, pull_requests, repo, pull_request_message_ref, message
- WHERE repo.repo_id = {repo_id}
- AND repo.repo_id = pull_requests.repo_id
- AND pull_requests.pull_request_id = pull_request_events.pull_request_id
- AND pull_requests.pull_request_id = pull_request_message_ref.pull_request_id
- AND pull_request_message_ref.msg_id = message.msg_id
+ COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count,
+ COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time,
+ COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count,
+ COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time,
+ COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses
+ FROM pull_requests
+ LEFT OUTER JOIN pull_request_events on pull_requests.pull_request_id = pull_request_events.pull_request_id
+ JOIN repo on repo.repo_id = pull_requests.repo_id
+ LEFT OUTER JOIN pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id
+ LEFT OUTER JOIN message on pull_request_message_ref.msg_id = message.msg_id
+ WHERE repo.repo_id = 1
GROUP BY pull_requests.pull_request_id
) response_times
ON pull_requests.pull_request_id = response_times.pull_request_id
- LEFT OUTER JOIN (
- SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count FROM pull_request_commits, pull_requests, pull_request_meta
+ LEFT JOIN (
+ SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count
+ FROM pull_request_commits, pull_requests, pull_request_meta
WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id
AND pull_requests.pull_request_id = pull_request_meta.pull_request_id
- AND pull_requests.repo_id = {repo_id}
+ AND pull_requests.repo_id = 1
AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha
AND pr_cmt_sha <> pull_request_meta.pr_sha
GROUP BY pull_request_commits.pull_request_id
) all_commit_counts
ON pull_requests.pull_request_id = all_commit_counts.pull_request_id
- LEFT OUTER JOIN (
+ LEFT JOIN (
SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label
FROM pull_requests, pull_request_meta
WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id
- AND pull_requests.repo_id = {repo_id}
+ AND pull_requests.repo_id = 1
AND pr_head_or_base = 'base'
GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label
) base_labels
ON base_labels.pull_request_id = all_commit_counts.pull_request_id
- LEFT OUTER JOIN (
+ LEFT JOIN (
SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count
FROM pull_request_commits, commits, pull_requests, pull_request_meta
WHERE cmt_commit_hash = pr_cmt_sha
AND pull_requests.pull_request_id = pull_request_commits.pull_request_id
AND pull_requests.pull_request_id = pull_request_meta.pull_request_id
- AND pull_requests.repo_id = {repo_id}
+ AND pull_requests.repo_id = 1
AND commits.repo_id = pull_requests.repo_id
AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha
AND commits.cmt_commit_hash <> pull_request_meta.pr_sha
@@ -136,7 +137,7 @@ def pull_request_data_collection(repo_id, start_date, end_date):
WHERE
repo.repo_group_id = repo_groups.repo_group_id
AND repo.repo_id = pull_requests.repo_id
- AND repo.repo_id = {repo_id}
+ AND repo.repo_id = 1
ORDER BY
merged_count DESC
""")
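The substance of this fix is the switch from implicit comma joins to LEFT OUTER JOINs plus COALESCE fallbacks: a pull request with no events or messages previously vanished from the result set, or surfaced NULLs into the plotting code, and now comes back as a row with zeroed or fallback metrics. A minimal sketch of the same pattern follows; the bound :repo_id parameter and the trimmed-down column list are illustrative, not the full endpoint query.

```python
# Sketch of the NULL-safe aggregation pattern used in the diff above.
# Column selection is reduced for clarity; not the full endpoint query.
import sqlalchemy as s

pr_response_sql = s.sql.text("""
    SELECT
        pr.pull_request_id,
        -- LEFT OUTER JOIN keeps PRs that have no messages at all...
        COALESCE(COUNT(DISTINCT m.msg_timestamp), 0) AS comment_count,
        -- ...and COALESCE supplies a fallback when an aggregate is NULL
        COALESCE(MIN(m.msg_timestamp), pr.pr_closed_at) AS first_response_time
    FROM pull_requests pr
    LEFT OUTER JOIN pull_request_message_ref ref
        ON pr.pull_request_id = ref.pull_request_id
    LEFT OUTER JOIN message m
        ON ref.msg_id = m.msg_id
    WHERE pr.repo_id = :repo_id
    GROUP BY pr.pull_request_id
""")
```

With the old comma-join form, a PR with zero comments produced no row at all; with LEFT JOINs it produces one row whose aggregates are NULL, which the COALESCE wrappers then turn into values the visualization code can plot.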
2 changes: 1 addition & 1 deletion augur/application/cli/_multicommand.py
@@ -27,7 +27,7 @@ def get_command(self, ctx, name):
try:
module = importlib.import_module('.' + name, 'augur.application.cli')
return module.cli
- except ModuleNotFoundError:
+ except ModuleNotFoundError as e:
pass

@click.command(cls=AugurMultiCommand, context_settings=CONTEXT_SETTINGS)
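For context, get_command here implements Click's lazy-subcommand pattern: each subcommand lives in its own module under augur.application.cli and is imported only when invoked. A self-contained sketch of that pattern, with only the get_command body mirroring the change above:

```python
# Sketch of the lazy-loading multicommand pattern; class name and group
# wiring are illustrative, only get_command mirrors the diff above.
import importlib
import click

class LazyMultiCommand(click.MultiCommand):
    def get_command(self, ctx, name):
        try:
            # e.g. "db" resolves to augur.application.cli.db and its `cli` group
            module = importlib.import_module('.' + name, 'augur.application.cli')
            return module.cli
        except ModuleNotFoundError as e:
            # returning None lets Click report "No such command" itself
            return None
```

Binding the exception as e gives the handler something to log later; a broken subcommand module is otherwise silently indistinguishable from a missing one.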
3 changes: 2 additions & 1 deletion augur/application/cli/backend.py
@@ -19,7 +19,8 @@
from datetime import datetime

from augur import instance_id
- from augur.tasks.start_tasks import augur_collection_monitor, CollectionState, create_collection_status_records
+ from augur.tasks.util.collection_state import CollectionState
+ from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records
from augur.tasks.git.facade_tasks import clone_repos
from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model
from augur.tasks.init.redis_connection import redis_connection
3 changes: 2 additions & 1 deletion augur/tasks/git/facade_tasks.py
@@ -31,7 +31,8 @@
from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor, get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits

from augur.tasks.github.facade_github.tasks import *
- from augur.tasks.util.collection_util import CollectionState, get_collection_status_repo_git_from_filter
+ from augur.tasks.util.collection_state import CollectionState
+ from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter
from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize


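This import change, together with the matching ones in backend.py above and in detect_move/core.py and celery_app.py below, replaces per-module copies of the CollectionState enum with a single shared module, so states added in one place (such as the IGNORE state used by the repo-move logic) are visible everywhere. A sketch of what augur/tasks/util/collection_state.py would contain, assuming it simply merges the members visible in this commit:

```python
# augur/tasks/util/collection_state.py -- sketch of the consolidated enum.
# Members shown are those visible in this commit's diffs; the real module
# may define more, and IGNORE's string value is an assumption.
from enum import Enum

class CollectionState(Enum):
    SUCCESS = "Success"
    PENDING = "Pending"
    ERROR = "Error"
    COLLECTING = "Collecting"
    INITIALIZING = "Initializing"
    UPDATE = "Update"
    FAILED_CLONE = "Failed Clone"
    IGNORE = "Ignore"  # assumed value, inferred from the .value usage below
```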
116 changes: 63 additions & 53 deletions augur/tasks/github/detect_move/core.py
@@ -6,20 +6,24 @@
from augur.tasks.github.util.util import parse_json_response
import logging
from datetime import datetime
- from enum import Enum
+ from augur.tasks.util.collection_state import CollectionState
from augur.application.db.util import execute_session_query

- class CollectionState(Enum):
-     SUCCESS = "Success"
-     PENDING = "Pending"
-     ERROR = "Error"
-     COLLECTING = "Collecting"


- def update_repo_with_dict(current_dict,new_dict,logger,db):
+ def update_repo_with_dict(repo,new_dict,logger,db):
"""
Update a repository record in the database using a dictionary tagged with
the appropriate table fields
Args:
repo: orm repo object to update
new_dict: dict of new values to add to the repo record
logger: logging object
db: db object
"""

- to_insert = current_dict
+ to_insert = repo.__dict__
del to_insert['_sa_instance_state']
to_insert.update(new_dict)

@@ -45,7 +49,6 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook='

owner, name = get_owner_repo(repo.repo_git)
url = f"https://api.github.com/repos/{owner}/{name}"
- current_repo_dict = repo.__dict__

attempts = 0
while attempts < 10:
@@ -56,64 +59,71 @@

attempts += 1

- #Mark as errored if not found
- if response_from_gh.status_code == 404:
-     logger.error(f"Repo {repo.repo_git} responded 404 when pinged!")
+ #Update Url and retry if 301
+ #301 moved permanently
+ if response_from_gh.status_code == 301:

+ owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger)

+ try:
+     old_description = str(repo.description)
+ except Exception:
+     old_description = ""

+ #Create new repo object to update existing
repo_update_dict = {
-     'repo_git': repo.repo_git,
-     'repo_path': None,
-     'repo_name': None,
-     'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. Instead, it appears to be deleted",
-     'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ')
+     'repo_git': f"https://github.com/{owner}/{name}",
+     'repo_path': None,
+     'repo_name': None,
+     'description': f"(Originally hosted at {url}) {old_description}"
}

- update_repo_with_dict(current_repo_dict, repo_update_dict, logger, augur_db)

raise Exception(f"ERROR: Repo not found at requested host {repo.repo_git}")
elif attempts >= 10:
logger.warning(f"Could not check if repo moved because the api timed out 10 times. Url: {url}")
return

+ update_repo_with_dict(repo, repo_update_dict, logger,augur_db)

- #skip if not moved
- #301 moved permanently
- if response_from_gh.status_code != 301:
-     logger.info(f"Repo found at url: {url}")
-     return
+ raise Exception("ERROR: Repo has moved! Resetting Collection!")

- owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger)


- try:
-     old_description = str(repo.description)
- except:
-     old_description = ""
+ #Mark as ignore if 404
+ if response_from_gh.status_code == 404:
+     repo_update_dict = {
+         'repo_git': repo.repo_git,
+         'repo_path': None,
+         'repo_name': None,
+         'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. Instead, it appears to be deleted",
+         'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ')
+     }

- #Create new repo object to update existing
- repo_update_dict = {
-     'repo_git': f"https://github.com/{owner}/{name}",
-     'repo_path': None,
-     'repo_name': None,
-     'description': f"(Originally hosted at {url}) {old_description}"
- }
+ update_repo_with_dict(repo, repo_update_dict, logger, augur_db)

- update_repo_with_dict(current_repo_dict, repo_update_dict, logger,augur_db)
+ statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id)

- statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id)
+ collectionRecord = execute_session_query(statusQuery,'one')

- collectionRecord = execute_session_query(statusQuery,'one')
if collection_hook == 'core':
- collectionRecord.core_status = CollectionState.PENDING.value
+ collectionRecord.core_status = CollectionState.IGNORE.value
collectionRecord.core_task_id = None
collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ')
elif collection_hook == 'secondary':
- collectionRecord.secondary_status = CollectionState.PENDING.value
+ collectionRecord.secondary_status = CollectionState.IGNORE.value
collectionRecord.secondary_task_id = None
collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ')

- augur_db.session.commit()
+ collectionRecord.facade_status = CollectionState.IGNORE.value
+ collectionRecord.facade_task_id = None
+ collectionRecord.facade_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ')

raise Exception("ERROR: Repo has moved! Marked repo as pending and stopped collection")
collectionRecord.ml_status = CollectionState.IGNORE.value
collectionRecord.ml_task_id = None
collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ')


+ augur_db.session.commit()
+ raise Exception("ERROR: Repo has moved! Resetting Collection!")


+ if attempts >= 10:
+     logger.error(f"Could not check if repo moved because the api timed out 10 times. Url: {url}")
+     raise Exception(f"ERROR: Could not get api response for repo: {url}")

+ #skip if not 404
+ logger.info(f"Repo found at url: {url}")
+ return
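The rewritten handler now distinguishes the two failure modes instead of conflating them: a 301 rewrites repo_git to the new location and raises so collection restarts against the new URL, while a 404 marks every collection hook IGNORE so the scheduler stops retrying a deleted repo. A condensed, illustrative sketch of that control flow; the record object and state values are stand-ins for Augur's models:

```python
# Condensed sketch of the 301/404 handling above; `collection_record`
# stands in for Augur's CollectionStatus row, and IGNORE's string value
# is assumed from the CollectionState enum usage in this commit.
from datetime import datetime

def classify_move_response(status_code):
    """301 -> repo moved (reset collection); 404 -> repo deleted (ignore)."""
    return {301: "moved", 404: "deleted"}.get(status_code, "found")

def ignore_all_hooks(collection_record, ignore_value="Ignore"):
    """404 path: stop collection on every hook for a deleted repo."""
    now = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ')
    for hook in ("core", "secondary", "facade", "ml"):
        setattr(collection_record, f"{hook}_status", ignore_value)
        setattr(collection_record, f"{hook}_task_id", None)
        setattr(collection_record, f"{hook}_data_last_collected", now)
```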

21 changes: 8 additions & 13 deletions augur/tasks/init/celery_app.py
@@ -20,16 +20,7 @@
from augur.application.db.engine import get_database_string
from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string
from augur.application.db.models import CollectionStatus, Repo

- class CollectionState(Enum):
-     SUCCESS = "Success"
-     PENDING = "Pending"
-     ERROR = "Error"
-     COLLECTING = "Collecting"
-     INITIALIZING = "Initializing"
-     UPDATE = "Update"
-     FAILED_CLONE = "Failed Clone"

+ from augur.tasks.util.collection_state import CollectionState

logger = logging.getLogger(__name__)

@@ -85,7 +76,7 @@ class CollectionState(Enum):
#Classes for tasks that take a repo_git as an argument.
class AugurCoreRepoCollectionTask(celery.Task):

- def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core'):
+ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core',after_fail=CollectionState.ERROR.value):
from augur.tasks.init.celery_app import engine

logger = AugurLogger(logger_name).get_logger()
@@ -104,7 +95,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h
prevStatus = getattr(repoStatus, f"{collection_hook}_status")

if prevStatus == CollectionState.COLLECTING.value or prevStatus == CollectionState.INITIALIZING.value:
- setattr(repoStatus, f"{collection_hook}_status", CollectionState.ERROR.value)
+ setattr(repoStatus, f"{collection_hook}_status", after_fail)
setattr(repoStatus, f"{collection_hook}_task_id", None)
session.commit()

@@ -129,6 +120,7 @@ def on_failure(self,exc,task_id,args,kwargs,einfo):
repo_git = args[0]
self.augur_handle_task_failure(exc,task_id,repo_git, "ml_task_failure", collection_hook='ml')


#task_cls='augur.tasks.init.celery_app:AugurCoreRepoCollectionTask'
celery_app = Celery('tasks', broker=BROKER_URL, backend=BACKEND_URL, include=tasks)

@@ -209,7 +201,7 @@ def setup_periodic_tasks(sender, **kwargs):
"""
from celery.schedules import crontab
from augur.tasks.start_tasks import augur_collection_monitor, augur_collection_update_weights
- from augur.tasks.start_tasks import non_repo_domain_tasks
+ from augur.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos
from augur.tasks.git.facade_tasks import clone_repos
from augur.tasks.db.refresh_materialized_views import refresh_materialized_views
from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model
@@ -234,6 +226,9 @@
logger.info(f"Scheduling update of collection weights on midnight each day")
sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s())

logger.info(f"Setting 404 repos to be marked for retry on midnight each day")
sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s())

logger.info(f"Scheduling contributor breadth every 30 days")
thirty_days_in_seconds = 30*24*60*60
sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s())
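Two pieces fit together in this file: augur_handle_task_failure gains an after_fail parameter so a failure hook can choose the terminal state (defaulting to ERROR as before), and the new retry_errored_repos task runs each midnight to give errored repos another attempt. A sketch of the beat wiring using Celery's documented on_after_configure hook; the broker URL and the task body are placeholders, not Augur's actual implementation:

```python
# Sketch of the periodic wiring shown above; broker URL and the task
# body are placeholders.
from celery import Celery
from celery.schedules import crontab

app = Celery('tasks', broker='redis://localhost:6379/0')

@app.task
def retry_errored_repos():
    """Stand-in: flip repos left in an error state back to a retryable one."""

@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    # midnight each day, matching crontab(hour=0, minute=0) in the diff
    sender.add_periodic_task(crontab(hour=0, minute=0), retry_errored_repos.s())
```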
