
Merge pull request #60 from Mesnage-Org/metadata
Metadata
bobturneruk committed Feb 11, 2022
2 parents 5a5d89e + 9351ada commit 8cffb48
Showing 16 changed files with 14,337 additions and 14,134 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ __pycache__/
# outputs
*Cleaned.csv
eg_output.csv
results*.csv

#IDE
.vscode/
7,974 changes: 3,987 additions & 3,987 deletions data/baseline_output_ftrs.csv

Large diffs are not rendered by default.

19,914 changes: 9,957 additions & 9,957 deletions data/baseline_output_mq.csv

Large diffs are not rendered by default.

68 changes: 68 additions & 0 deletions data_dictionary.md
@@ -0,0 +1,68 @@
# Data Dictionary

*This document is a work in progress.*

## Mass Lists

Format: `CSV` (`.csv`)

| Column | Description | Unit |
|---|---|---|
| Structure | Structure code | NA |
| Monoisotopicmass | Monoisotopic mass | *TBD* |

## FTRS Input Files

Format: `.ftrs`

## MaxQuant Input Files

Format: `TSV` (`.txt`)

## FTRS Output Files

Format: `CSV` (`.csv`)

The name of the first column contains [embedded metadata](#embedded-metadata) on the provenance of the file. Subsequent columns are defined as follows:

| Column | Description | Unit |
|---|---|---|
| ID | *TBD* | *TBD* |
| xicStart | *TBD* | *TBD* |
| xicEnd | *TBD* | *TBD* |
| feature | *TBD* | *TBD* |
| corrMax | *TBD* | *TBD* |
| ionCount | *TBD* | *TBD* |
| chargeOrder | *TBD* | *TBD* |
| maxIsotopeCount | *TBD* | *TBD* |
| rt | *TBD* | *TBD* |
| mwMonoisotopic | *TBD* | *TBD* |
| theo_mwMonoisotopic | *TBD* | *TBD* |
| inferredStructure | *TBD* | *TBD* |
| maxIntensity | *TBD* | *TBD* |

## MaxQuant Output Files

Format: `CSV` (`.csv`)

The name of the first column contains [embedded metadata](#embedded-metadata) on the provenance of the file. Subsequent columns are defined as follows:

| Column | Description | Unit |
|---|---|---|
| ID | *TBD* | *TBD* |
| rt | *TBD* | *TBD* |
| rt_length | *TBD* | *TBD* |
| mwMonoisotopic | *TBD* | *TBD* |
| theo_mwMonoisotopic | *TBD* | *TBD* |
| inferredStructure | *TBD* | *TBD* |
| maxIntensity | *TBD* | *TBD* |

## Embedded Metadata

| Data | Description |
|---|---|
| file | Input data file |
| masses_file | Mass list file |
| modifications | List of modifications (*TBD*) |
| ppm | ppm tolerance (*TBD*) |
| rt_window | *TBD* |
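
The dictionary does not yet say how this metadata is encoded in the first column's name. As a minimal sketch, assuming the metadata is stored verbatim as the header string of an extra leading column (the filename below is hypothetical):

```python
# Minimal sketch: pull the embedded metadata and the data columns out of an
# output CSV. Assumes the metadata is stored verbatim as the first column's
# header; the filename is hypothetical.
import pandas as pd

results = pd.read_csv("results_maxquant_test_data.csv")

metadata_header = results.columns[0]      # provenance metadata
data_columns = list(results.columns[1:])  # ID, rt, ... as tabulated above

print("Embedded metadata:", metadata_header)
print("Data columns:", data_columns)
```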
17 changes: 10 additions & 7 deletions demo.py
@@ -1,18 +1,21 @@
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

csv_filepath = "data/masses/e_coli_monomer_masses.csv"
mq_filepath = "data/maxquant_test_data.txt"
csv_filepath = "data/masses/e_coli_monomer_masses.csv"

raw_data = matching.maxquant_file_reader(mq_filepath)
validation.validate_raw_data_df(raw_data)
masses = pgio.ms_file_reader(mq_filepath)
validation.validate_raw_data_df(masses)

theo_masses = matching.theo_masses_reader(csv_filepath)
theo_masses = pgio.theo_masses_reader(csv_filepath)
validation.validate_theo_masses_df(theo_masses)

mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']
validation.validate_enabled_mod_list(mod_test)

results = matching.data_analysis(raw_data, theo_masses, 0.5, mod_test, 10)
results = matching.data_analysis(masses, theo_masses, 0.5, mod_test, 10)

print(results.attrs['metadata'])
print(results)

print(results)
pgio.dataframe_to_csv_metadata(save_filepath='./', output_dataframe=results)
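
Because the rendered hunk above interleaves old and new lines without +/- markers, here is the post-change demo.py reassembled from the added and retained lines (blank-line placement is inferred):

```python
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

mq_filepath = "data/maxquant_test_data.txt"
csv_filepath = "data/masses/e_coli_monomer_masses.csv"

masses = pgio.ms_file_reader(mq_filepath)
validation.validate_raw_data_df(masses)

theo_masses = pgio.theo_masses_reader(csv_filepath)
validation.validate_theo_masses_df(theo_masses)

mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']
validation.validate_enabled_mod_list(mod_test)

results = matching.data_analysis(masses, theo_masses, 0.5, mod_test, 10)

print(results.attrs['metadata'])
print(results)

pgio.dataframe_to_csv_metadata(save_filepath='./', output_dataframe=results)
```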
17 changes: 10 additions & 7 deletions demo_ftrs.py
@@ -1,18 +1,21 @@
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

csv_filepath = "data/masses/e_coli_monomer_masses.csv"
ftrs_filepath = "data/ftrs_test_data.ftrs"
csv_filepath = "data/masses/e_coli_monomer_masses.csv"

raw_data = matching.ftrs_reader(ftrs_filepath)
validation.validate_raw_data_df(raw_data)
masses = pgio.ms_file_reader(ftrs_filepath)
validation.validate_raw_data_df(masses)

theo_masses = matching.theo_masses_reader(csv_filepath)
theo_masses = pgio.theo_masses_reader(csv_filepath)
validation.validate_theo_masses_df(theo_masses)

mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']
validation.validate_enabled_mod_list(mod_test)

results = matching.data_analysis(raw_data, theo_masses, 0.5, mod_test, 10)
results = matching.data_analysis(masses, theo_masses, 0.5, mod_test, 10)

print(results.attrs['metadata'])
print(results)

print(results)
pgio.dataframe_to_csv_metadata(save_filepath='./', output_dataframe=results)
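
These changes mirror demo.py above: the single `pgio.ms_file_reader` entry point now reads the `.ftrs` input as well (presumably selecting the parser from the file extension), and the results are written out with `pgio.dataframe_to_csv_metadata` so the provenance metadata travels with the CSV.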
22 changes: 13 additions & 9 deletions make_baseline.py
@@ -1,18 +1,22 @@
import pgfinder.matching as matching
import pgfinder.pgio as pgio
import pgfinder.validation as validation

# Set mass database and modifications
csv_filepath = "data/masses/e_coli_monomer_masses.csv"
theo_masses = matching.theo_masses_reader(csv_filepath)
theo_masses = pgio.theo_masses_reader(csv_filepath)
mod_test = ['Sodium','Potassium','Anhydro','DeAc','Deacetyl_Anhydro','Nude','Decay','Amidation','Amidase','Double_Anh','multimers_Glyco']

# Generate maxquant baseline
mq_file_name = "data/maxquant_test_data.txt"
raw_data_mq = matching.maxquant_file_reader(mq_file_name)
results = matching.data_analysis(raw_data_mq, theo_masses, 0.5, mod_test, 10)
results.to_csv("data/baseline_output_mq.csv")
mq_filepath = "data/maxquant_test_data.txt"
masses_mq = pgio.ms_file_reader(mq_filepath)
validation.validate_raw_data_df(masses_mq)
results = matching.data_analysis(masses_mq, theo_masses, 0.5, mod_test, 10)
pgio.dataframe_to_csv_metadata(save_filepath='./data/', output_dataframe=results, filename='baseline_output_mq.csv')

# Generate ftrs baseline
ftrs_file_name = "data/ftrs_test_data.ftrs"
raw_data_ftrs = matching.ftrs_reader(ftrs_file_name)
results = matching.data_analysis(raw_data_ftrs, theo_masses, 0.5, mod_test, 10)
results.to_csv("data/baseline_output_ftrs.csv")
ftrs_filepath = "data/ftrs_test_data.ftrs"
masses_ftrs = pgio.ms_file_reader(ftrs_filepath)
validation.validate_raw_data_df(masses_ftrs)
results = matching.data_analysis(masses_ftrs, theo_masses, 0.5, mod_test, 10)
pgio.dataframe_to_csv_metadata(save_filepath='./data/', output_dataframe=results, filename='baseline_output_ftrs.csv')
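
The committed baseline CSVs are presumably there for regression comparison against future runs. A hypothetical check, not part of this commit (the fresh-output path is made up), might look like this:

```python
# Hypothetical regression check against a committed baseline (not part of
# this commit). The first column's name embeds run-specific metadata, so it
# is normalised before the frames are compared.
import pandas as pd

def read_normalised(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    return df.rename(columns={df.columns[0]: "metadata"})

baseline = read_normalised("data/baseline_output_mq.csv")
fresh = read_normalised("fresh_output_mq.csv")  # hypothetical path

pd.testing.assert_frame_equal(baseline, fresh)
```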
11 changes: 6 additions & 5 deletions pgfinder.ipynb
@@ -7,7 +7,7 @@
"# pgfinder End-to-end Demo\n",
"\n",
"## Introduction\n",
"This is a demo of running pgfinder end-to-end with data held in the [pgfinder Repository on GitHub](https://github.com/Mesnage-Org/Mass-Spec-pgfinder-Analysis/).\n",
"This is a demo of running pgfinder end-to-end with data held in the [pgfinder Repository on GitHub](https://github.com/Mesnage-Org/Mass-Spec-pgfinder-Analysis/). This notebook runs on the latest release. Please review the [release notes](https://github.com/Mesnage-Org/pgfinder/releases).\n",
"\n",
"## How to run it:\n",
"If you're running this via `mybinder` click the `Cell` menu, then `Run All`."
@@ -28,7 +28,8 @@
"source": [
"from ipysheet import from_dataframe\n",
"import pandas as pd\n",
"import pgfinder.matching as matching"
"import pgfinder.matching as matching\n",
"import pgfinder.pgio as pgio"
]
},
{
@@ -72,7 +73,7 @@
"metadata": {},
"outputs": [],
"source": [
"raw_data = matching.maxquant_file_reader(mq_filepath)\n",
"raw_data = pgio.ms_file_reader(mq_filepath)\n",
"display(from_dataframe(raw_data.head(10)))"
]
},
@@ -144,9 +145,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
105 changes: 8 additions & 97 deletions pgfinder/matching.py
@@ -1,9 +1,6 @@
import pandas as pd
import sqlite3
import numpy as np
from decimal import *


import pandas as pd

def calc_ppm_tolerance(mw: float, ppm_tol: int = 10):
'''
@@ -288,89 +285,13 @@ def clean_up(ftrs_df, mass_to_clean: Decimal, time_delta: float):

return consolidated_decay_df


def ftrs_reader(filePath: str):
'''
Reads FTRS file from Byos
:param filePath:
:return dataframe:
'''

with sqlite3.connect(filePath) as db:

sql = "SELECT * FROM Features"
#Reads sql database into dataframe
ff = pd.read_sql(sql, db)
# adds inferredStructure column
ff['inferredStructure'] = np.nan
# adds theo_mwMonoisotopic column
ff['theo_mwMonoisotopic'] = np.nan
# Renames columns to expected column heading required for data_analysis function
ff.rename(columns={'Id': 'ID', 'apexRetentionTimeMinutes': 'rt', 'apexMwMonoisotopic': 'mwMonoisotopic', 'maxAveragineCorrelation': 'corrMax' }, inplace=True)
# Desired column order
cols_order = ['ID', 'xicStart', 'xicEnd', 'feature', 'corrMax', 'ionCount', 'chargeOrder', 'maxIsotopeCount',
'rt', 'mwMonoisotopic','theo_mwMonoisotopic', 'inferredStructure', 'maxIntensity', ]
# Reorder columns in dataframe to desired order.
ff = ff[cols_order]

return ff


def maxquant_file_reader(filepath: str):
'''
Reads maxquant files and outputs data as a dataframe
:param filepath (file should be a text file):
:return dataframe:
'''

# reads file into dataframe
maxquant_df = pd.read_table(filepath, low_memory=False)
# adds inferredStructure column
maxquant_df['inferredStructure'] = np.nan
# adds theo_mwMonoisotopic column
maxquant_df['theo_mwMonoisotopic'] = np.nan
# insert dataframe index as a column
maxquant_df.reset_index(level=0, inplace=True)
# Renames columns to expected column heading required for data_analysis function
maxquant_df.rename(columns={'index': 'ID','Retention time': 'rt', 'Retention length': 'rt_length',
'Mass': 'mwMonoisotopic', 'Intensity': "maxIntensity"},
inplace=True)
# Keeps only essential columns, all extraneous columns are left out.
focused_maxquant_df = maxquant_df[['ID', 'mwMonoisotopic', 'rt', 'rt_length', 'maxIntensity', 'inferredStructure',
'theo_mwMonoisotopic']]
# Desired column order
cols_order = ['ID', 'rt', 'rt_length', 'mwMonoisotopic', 'theo_mwMonoisotopic', 'inferredStructure', 'maxIntensity']
# Reorder columns in dataframe to desired order.
focused_maxquant_df = focused_maxquant_df[cols_order]


return focused_maxquant_df


def theo_masses_reader(filepath: str):

'''
Reads theoretical masses files (csv)
:param filepath:
:return dataframe:
'''
# reads csv files and converts to dataframe
theo_masses_df = pd.read_csv(filepath)

return theo_masses_df


def data_analysis(raw_data_df: pd.DataFrame, theo_masses_df: pd.DataFrame, rt_window: float, enabled_mod_list: list, user_ppm = int):

sugar = Decimal('203.0793')
sodium = Decimal('21.9819')
potassium = Decimal('37.9559')
time_delta_window = rt_window #retention time window to look in for in source decay products (rt of parent ion plus or minus time_delta)




theo = theo_masses_df
ff = raw_data_df

@@ -455,8 +376,6 @@ def data_analysis(raw_data_df: pd.DataFrame, theo_masses_df: pd.DataFrame, rt_wi
else:
double_Anhydro_df = pd.DataFrame()



master_frame = [obs_theo_df, adducts_potassium_df, adducts_sodium_df, anhydro_df, deac_anhy_df, deacetyl_df,
oacetyl_df, decay_df, nude_df, ami_df, deglyco_df, double_Anhydro_df]
master_list = pd.concat(master_frame)
@@ -470,19 +389,11 @@

cleaned_data_df.sort_values('inferredStructure', inplace=True, ascending=True)

return cleaned_data_df

def dataframe_to_csv(save_filepath: str, filename:str ,output_dataframe: pd.DataFrame ):
'''
Writes dataframe to csv file at desired file location
:param save_filepath:
:param filename:
:param output_dataframe:
:return csv file:
'''

#Combine save location and desired file name with correct formatting for output as csv file.
write_location = save_filepath + '/' + filename + '.csv'
output_dataframe.to_csv(write_location, index=False)

# set metadata
cleaned_data_df.attrs['file'] = raw_data_df.attrs['file']
cleaned_data_df.attrs['masses_file'] = theo_masses_df.attrs['file']
cleaned_data_df.attrs['rt_window'] = rt_window
cleaned_data_df.attrs['modifications'] = enabled_mod_list
cleaned_data_df.attrs['ppm'] = user_ppm

return cleaned_data_df

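
`data_analysis` now records provenance as `DataFrame.attrs` instead of each script writing plain CSVs; the actual writer, `pgio.dataframe_to_csv_metadata`, is not shown in this diff. Purely as an illustration of the idea (the function and the JSON encoding below are assumptions, not pgfinder's implementation), the attrs could be folded into the first column's header like this:

```python
# Illustration only: one way to embed DataFrame.attrs in a CSV header. This
# is NOT pgfinder's dataframe_to_csv_metadata; that implementation is not
# shown in this diff.
import json
import pandas as pd

def write_csv_with_metadata_header(df: pd.DataFrame, path: str) -> None:
    out = df.copy()
    # Serialise the provenance attrs (file, masses_file, rt_window,
    # modifications, ppm) into a single string used as the name of an
    # extra leading column.
    metadata_header = json.dumps(out.attrs)
    out.insert(0, metadata_header, out.index)
    out.to_csv(path, index=False)
```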