
Merge pull request #60 from Mesnage-Org/metadata
Metadata
bobturneruk committed Feb 11, 2022
2 parents 5a5d89e + 9351ada commit 8cffb48
Showing 16 changed files with 14,337 additions and 14,134 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ __pycache__/
# outputs
*Cleaned.csv
eg_output.csv
results*.csv

#IDE
.vscode/
7,974 changes: 3,987 additions & 3,987 deletions data/baseline_output_ftrs.csv

Large diffs are not rendered by default.

19,914 changes: 9,957 additions & 9,957 deletions data/baseline_output_mq.csv

Large diffs are not rendered by default.

68 changes: 68 additions & 0 deletions data_dictionary.md
@@ -0,0 +1,68 @@
# Data Dictionary

*This document is a work in progress.*

## Mass Lists

Format: `CSV` (`.csv`)

| Column | Description | Unit |
|---|---|---|
| Structure | Structure code | NA |
| Monoisotopicmass | Monoisotopic mass | *TBD* |

## FTRS Input Files

Format: `.ftrs`

## MaxQuant Input Files

Format: `TSV` (`.txt`)

## FTRS Output Files

Format: `CSV` (`.csv`)

The name of the first column contains [embedded metadata](#embedded-metadata) on the provenance of the file. Subsequent columns are defined as follows:

| Column | Description | Unit |
|---|---|---|
| ID | *TBD* | *TBD* |
| xicStart | *TBD* | *TBD* |
| xicEnd | *TBD* | *TBD* |
| feature | *TBD* | *TBD* |
| corrMax | *TBD* | *TBD* |
| ionCount | *TBD* | *TBD* |
| chargeOrder | *TBD* | *TBD* |
| maxIsotopeCount | *TBD* | *TBD* |
| rt | *TBD* | *TBD* |
| mwMonoisotopic | *TBD* | *TBD* |
| theo_mwMonoisotopic | *TBD* | *TBD* |
| inferredStructure | *TBD* | *TBD* |
| maxIntensity | *TBD* | *TBD* |

## MaxQuant Output Files

Format: `CSV` (`.csv`)

The name of the first column contains [embedded metadata](#embedded-metadata) on the provenance of the file. Subsequent columns are defined as follows:

| Column | Description | Unit |
|---|---|---|
| ID | *TBD* | *TBD* |
| rt | *TBD* | *TBD* |
| rt_length | *TBD* | *TBD* |
| mwMonoisotopic | *TBD* | *TBD* |
| theo_mwMonoisotopic | *TBD* | *TBD* |
| inferredStructure | *TBD* | *TBD* |
| maxIntensity | *TBD* | *TBD* |

## Embedded Metadata

| Data | Description |
|---|---|
| file | Input data file |
| masses_file | Mass list file |
| modifications | List of modifications (*TBD*) |
| ppm | ppm tolerance (*TBD*) |
| rt_window | *TBD* |
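
The dictionary does not yet say how this metadata is encoded in the first column's name. As a minimal sketch, assuming the metadata is stored verbatim as the header string of an extra leading column (the filename below is hypothetical):

```python
# Minimal sketch: pull the embedded metadata and the data columns out of an
# output CSV. Assumes the metadata is stored verbatim as the first column's
# header; the filename is hypothetical.
import pandas as pd

results = pd.read_csv("results_maxquant_test_data.csv")

metadata_header = results.columns[0]      # provenance metadata
data_columns = list(results.columns[1:])  # ID, rt, ... as tabulated above

print("Embedded metadata:", metadata_header)
print("Data columns:", data_columns)
```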
17 changes: 10 additions & 7 deletions demo.py
@@ -1,18 +1,21 @@
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

csv_filepath = "data/masses/e_coli_monomer_masses.csv"
mq_filepath = "data/maxquant_test_data.txt"
csv_filepath = "data/masses/e_coli_monomer_masses.csv"

raw_data = matching.maxquant_file_reader(mq_filepath)
validation.validate_raw_data_df(raw_data)
masses = pgio.ms_file_reader(mq_filepath)
validation.validate_raw_data_df(masses)

theo_masses = matching.theo_masses_reader(csv_filepath)
theo_masses = pgio.theo_masses_reader(csv_filepath)
validation.validate_theo_masses_df(theo_masses)

mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']
validation.validate_enabled_mod_list(mod_test)

results = matching.data_analysis(raw_data, theo_masses, 0.5, mod_test, 10)
results = matching.data_analysis(masses, theo_masses, 0.5, mod_test, 10)

print(results.attrs['metadata'])
print(results)

print(results)
pgio.dataframe_to_csv_metadata(save_filepath='./', output_dataframe=results)
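
Because the rendered hunk above interleaves old and new lines without +/- markers, here is the post-change demo.py reassembled from the added and retained lines (blank-line placement is inferred):

```python
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

mq_filepath = "data/maxquant_test_data.txt"
csv_filepath = "data/masses/e_coli_monomer_masses.csv"

masses = pgio.ms_file_reader(mq_filepath)
validation.validate_raw_data_df(masses)

theo_masses = pgio.theo_masses_reader(csv_filepath)
validation.validate_theo_masses_df(theo_masses)

mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']
validation.validate_enabled_mod_list(mod_test)

results = matching.data_analysis(masses, theo_masses, 0.5, mod_test, 10)

print(results.attrs['metadata'])
print(results)

pgio.dataframe_to_csv_metadata(save_filepath='./', output_dataframe=results)
```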
17 changes: 10 additions & 7 deletions demo_ftrs.py
@@ -1,18 +1,21 @@
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

csv_filepath = "data/masses/e_coli_monomer_masses.csv"
ftrs_filepath = "data/ftrs_test_data.ftrs"
csv_filepath = "data/masses/e_coli_monomer_masses.csv"

raw_data = matching.ftrs_reader(ftrs_filepath)
validation.validate_raw_data_df(raw_data)
masses = pgio.ms_file_reader(ftrs_filepath)
validation.validate_raw_data_df(masses)

theo_masses = matching.theo_masses_reader(csv_filepath)
theo_masses = pgio.theo_masses_reader(csv_filepath)
validation.validate_theo_masses_df(theo_masses)

mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']
validation.validate_enabled_mod_list(mod_test)

results = matching.data_analysis(raw_data, theo_masses, 0.5, mod_test, 10)
results = matching.data_analysis(masses, theo_masses, 0.5, mod_test, 10)

print(results.attrs['metadata'])
print(results)

print(results)
pgio.dataframe_to_csv_metadata(save_filepath='./', output_dataframe=results)
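
These changes mirror demo.py above: the single `pgio.ms_file_reader` entry point now reads the `.ftrs` input as well (presumably selecting the parser from the file extension), and the results are written out with `pgio.dataframe_to_csv_metadata` so the provenance metadata travels with the CSV.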
22 changes: 13 additions & 9 deletions make_baseline.py
@@ -1,18 +1,22 @@
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

# Set mass database and modifications
csv_filepath = "data/masses/e_coli_monomer_masses.csv"
theo_masses = matching.theo_masses_reader(csv_filepath)
theo_masses = pgio.theo_masses_reader(csv_filepath)
mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']

# Generate maxquant baseline
mq_file_name = "data/maxquant_test_data.txt"
raw_data_mq = matching.maxquant_file_reader(mq_file_name)
results = matching.data_analysis(raw_data_mq, theo_masses, 0.5, mod_test, 10)
results.to_csv("data/baseline_output_mq.csv")
mq_filepath = "data/maxquant_test_data.txt"
masses_mq = pgio.ms_file_reader(mq_filepath)
validation.validate_raw_data_df(masses_mq)
results = matching.data_analysis(masses_mq, theo_masses, 0.5, mod_test, 10)
pgio.dataframe_to_csv_metadata(save_filepath='./data/', output_dataframe=results, filename='baseline_output_mq.csv')

# Generate ftrs baseline
ftrs_file_name = "data/ftrs_test_data.ftrs"
raw_data_ftrs = matching.ftrs_reader(ftrs_file_name)
results = matching.data_analysis(raw_data_ftrs, theo_masses, 0.5, mod_test, 10)
results.to_csv("data/baseline_output_ftrs.csv")
ftrs_filepath = "data/ftrs_test_data.ftrs"
masses_ftrs = pgio.ms_file_reader(ftrs_filepath)
validation.validate_raw_data_df(masses_ftrs)
results = matching.data_analysis(masses_ftrs, theo_masses, 0.5, mod_test, 10)
pgio.dataframe_to_csv_metadata(save_filepath='./data/', output_dataframe=results, filename='baseline_output_ftrs.csv')
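
The committed baseline CSVs are presumably there for regression comparison against future runs. A hypothetical check, not part of this commit (the fresh-output path is made up), might look like this:

```python
# Hypothetical regression check against a committed baseline (not part of
# this commit). The first column's name embeds run-specific metadata, so it
# is normalised before the frames are compared.
import pandas as pd

def read_normalised(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    return df.rename(columns={df.columns[0]: "metadata"})

baseline = read_normalised("data/baseline_output_mq.csv")
fresh = read_normalised("fresh_output_mq.csv")  # hypothetical path

pd.testing.assert_frame_equal(baseline, fresh)
```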
11 changes: 6 additions & 5 deletions pgfinder.ipynb
@@ -7,7 +7,7 @@
"# pgfinder End-to-end Demo\n",
"\n",
"## Introduction\n",
"This is a demo of running pgfinder end-to-end with data held in the [pgfinder Repository on GitHub](https://github.com/Mesnage-Org/Mass-Spec-pgfinder-Analysis/).\n",
"This is a demo of running pgfinder end-to-end with data held in the [pgfinder Repository on GitHub](https://github.com/Mesnage-Org/Mass-Spec-pgfinder-Analysis/). This notebook runs on the latest release. Please review the [release notes](https://github.com/Mesnage-Org/pgfinder/releases).\n",
"\n",
"## How to run it:\n",
"If you're running this via `mybinder` click the `Cell` menu, then `Run All`."
@@ -28,7 +28,8 @@
"source": [
"from ipysheet import from_dataframe\n",
"import pandas as pd\n",
"import pgfinder.matching as matching"
"import pgfinder.matching as matching\n",
"import pgfinder.pgio as pgio"
]
},
{
@@ -72,7 +73,7 @@
"metadata": {},
"outputs": [],
"source": [
"raw_data = matching.maxquant_file_reader(mq_filepath)\n",
"raw_data = pgio.ms_file_reader(mq_filepath)\n",
"display(from_dataframe(raw_data.head(10)))"
]
},
@@ -144,9 +145,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
105 changes: 8 additions & 97 deletions pgfinder/matching.py
@@ -1,9 +1,6 @@
import pandas as pd
import sqlite3
import numpy as np
from decimal import *


import pandas as pd

def calc_ppm_tolerance(mw: float, ppm_tol: int = 10):
'''
@@ -288,89 +285,13 @@ def clean_up(ftrs_df, mass_to_clean: Decimal, time_delta: float):

return consolidated_decay_df


def ftrs_reader(filePath: str):
'''
Reads FTRS file from Byos
:param filePath:
:return dataframe:
'''

with sqlite3.connect(filePath) as db:

sql = "SELECT * FROM Features"
#Reads sql database into dataframe
ff = pd.read_sql(sql, db)
# adds inferredStructure column
ff['inferredStructure'] = np.nan
# adds theo_mwMonoisotopic column
ff['theo_mwMonoisotopic'] = np.nan
# Renames columns to expected column heading required for data_analysis function
ff.rename(columns={'Id': 'ID', 'apexRetentionTimeMinutes': 'rt', 'apexMwMonoisotopic': 'mwMonoisotopic', 'maxAveragineCorrelation': 'corrMax' }, inplace=True)
# Desired column order
cols_order = ['ID', 'xicStart', 'xicEnd', 'feature', 'corrMax', 'ionCount', 'chargeOrder', 'maxIsotopeCount',
'rt', 'mwMonoisotopic','theo_mwMonoisotopic', 'inferredStructure', 'maxIntensity', ]
# Reorder columns in dataframe to desired order.
ff = ff[cols_order]

return ff


def maxquant_file_reader(filepath: str):
'''
Reads maxquant files and outputs data as a dataframe
:param filepath (file should be a text file):
:return dataframe:
'''

# reads file into dataframe
maxquant_df = pd.read_table(filepath, low_memory=False)
# adds inferredStructure column
maxquant_df['inferredStructure'] = np.nan
# adds theo_mwMonoisotopic column
maxquant_df['theo_mwMonoisotopic'] = np.nan
# insert dataframe index as a column
maxquant_df.reset_index(level=0, inplace=True)
# Renames columns to expected column heading required for data_analysis function
maxquant_df.rename(columns={'index': 'ID','Retention time': 'rt', 'Retention length': 'rt_length',
'Mass': 'mwMonoisotopic', 'Intensity': "maxIntensity"},
inplace=True)
# Keeps only essential columns, all extraneous columns are left out.
focused_maxquant_df = maxquant_df[['ID', 'mwMonoisotopic', 'rt', 'rt_length', 'maxIntensity', 'inferredStructure',
'theo_mwMonoisotopic']]
# Desired column order
cols_order = ['ID', 'rt', 'rt_length', 'mwMonoisotopic', 'theo_mwMonoisotopic', 'inferredStructure', 'maxIntensity']
# Reorder columns in dataframe to desired order.
focused_maxquant_df = focused_maxquant_df[cols_order]


return focused_maxquant_df


def theo_masses_reader(filepath: str):

'''
Reads theoretical masses files (csv)
:param filepath:
:return dataframe:
'''
# reads csv files and converts to dataframe
theo_masses_df = pd.read_csv(filepath)

return theo_masses_df


def data_analysis(raw_data_df: pd.DataFrame, theo_masses_df: pd.DataFrame, rt_window: float, enabled_mod_list: list, user_ppm = int):

sugar = Decimal('203.0793')
sodium = Decimal('21.9819')
potassium = Decimal('37.9559')
time_delta_window = rt_window #retention time window to look in for in source decay products (rt of parent ion plus or minus time_delta)




theo = theo_masses_df
ff = raw_data_df

@@ -455,8 +376,6 @@ def data_analysis(raw_data_df: pd.DataFrame, theo_masses_df: pd.DataFrame, rt_wi
else:
double_Anhydro_df = pd.DataFrame()



master_frame = [obs_theo_df, adducts_potassium_df, adducts_sodium_df, anhydro_df, deac_anhy_df, deacetyl_df,
oacetyl_df, decay_df, nude_df, ami_df, deglyco_df, double_Anhydro_df]
master_list = pd.concat(master_frame)
@@ -470,19 +389,11 @@

cleaned_data_df.sort_values('inferredStructure', inplace=True, ascending=True)

return cleaned_data_df

def dataframe_to_csv(save_filepath: str, filename:str ,output_dataframe: pd.DataFrame ):
'''
Writes dataframe to csv file at desired file location
:param save_filepath:
:param filename:
:param output_dataframe:
:return csv file:
'''

#Combine save location and desired file name with correct formatting for output as csv file.
write_location = save_filepath + '/' + filename + '.csv'
output_dataframe.to_csv(write_location, index=False)

# set metadata
cleaned_data_df.attrs['file'] = raw_data_df.attrs['file']
cleaned_data_df.attrs['masses_file'] = theo_masses_df.attrs['file']
cleaned_data_df.attrs['rt_window'] = rt_window
cleaned_data_df.attrs['modifications'] = enabled_mod_list
cleaned_data_df.attrs['ppm'] = user_ppm

return cleaned_data_df

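
`data_analysis` now records provenance as `DataFrame.attrs` instead of each script writing plain CSVs; the actual writer, `pgio.dataframe_to_csv_metadata`, is not shown in this diff. Purely as an illustration of the idea (the function and the JSON encoding below are assumptions, not pgfinder's implementation), the attrs could be folded into the first column's header like this:

```python
# Illustration only: one way to embed DataFrame.attrs in a CSV header. This
# is NOT pgfinder's dataframe_to_csv_metadata; that implementation is not
# shown in this diff.
import json
import pandas as pd

def write_csv_with_metadata_header(df: pd.DataFrame, path: str) -> None:
    out = df.copy()
    # Serialise the provenance attrs (file, masses_file, rt_window,
    # modifications, ppm) into a single string used as the name of an
    # extra leading column.
    metadata_header = json.dumps(out.attrs)
    out.insert(0, metadata_header, out.index)
    out.to_csv(path, index=False)
```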