Merge branch 'release/3.0.0'

dermatologist · Oct 23, 2020 · 1b5aa03 · 1b5aa03
2 parents 8bf264f + 5820fa2
commit 1b5aa03
Show file tree

Hide file tree

Showing 26 changed files with 417 additions and 46 deletions.
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -0,0 +1,33 @@
+# CI workflow: runs pytest on pushes to develop and on pull requests
+# that target master or develop.
+name: Python Test
+
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    branches:
+      - master
+      - develop
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 4
+      matrix:
+        # Single-entry matrix; add versions here to test more interpreters.
+        python-version: [3.7]
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v1
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+    - name: Test with pytest
+      # pytest is invoked without a separate install step, so it is
+      # presumably listed in requirements.txt - TODO confirm.
+      run: |
+        pytest
diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml
@@ -0,0 +1,30 @@
+# CI workflow: runs the tox test suite on pushes to master.
+# NOTE(review): the display name below is identical to the one used by
+# pytest.yml, so the two workflows are indistinguishable by name.
+name: Python Test
+
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 4
+      matrix:
+        # Single-entry matrix; add versions here to test more interpreters.
+        python-version: [3.7]
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v1
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install tox
+    - name: Test with tox
+      run: |
+        tox
diff --git a/.gitignore b/.gitignore
@@ -48,5 +48,4 @@ MANIFEST
 
 # Per-project virtualenvs
 .venv*/
-.vscode
 cdm6.sqlite
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,11 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Pytest",
+            "type": "python",
+            "request": "launch",
+            "module": "pytest"
+        }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,10 @@
+{
+    "licenser.author": "Bell Eapen",
+    "licenser.projectName": "PyOMOP",
+    "licenser.license": "GPLv3",
+    "licenser.useSingleLineStyle": false,
+    "cSpell.words": [
+        "OMOP",
+        "pyomop"
+    ]
+}
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,9 +2,9 @@
 Changelog
 =========
 
-Version 0.1
+Version 3.0
 ===========
 
-- Feature A added
-- FIX: nasty bug #1729 fixed
-- add your changes here!
+- FIX: Person table
+- add tox test suite
+- FEAT: vector table
diff --git a/README.md b/README.md
@@ -1,10 +1,10 @@
 # pyomop
 
-OMOP CDM utils
+OMOP CDM utils. This repo may be similar to [@jbadger3's](https://github.com/jbadger3) [inspectomop](https://github.com/jbadger3/inspectomop), but this is not a fork.
 
 ## Description
 
-The [OHSDI](https://www.ohdsi.org/) OMOP Common Data Model allows for the systematic analysis of healthcare observational databases. This is a python library to use the CDM v6 compliant databases.
+The [OHDSI](https://www.ohdsi.org/) OMOP Common Data Model allows for the systematic analysis of healthcare observational databases. This is a Python library for working with CDM v6 compliant databases, using SQLAlchemy as the ORM. **pyomop** also supports converting query results to a pandas dataframe (see below) for use in machine learning pipelines. See some useful [SQL Queries here.](https://github.com/OHDSI/QueryLibrary)
 
 ### Support
 * Postgres
@@ -22,33 +22,49 @@ pip install pyomop
 ## Usage
 
 ```
-from pyomop import CdmEngineFactory, CdmVocabulary, Cohort, Vocabulary, metadata
+
+from pyomop import CdmEngineFactory, CdmVocabulary, CdmVector, Cohort, Vocabulary, metadata
 from sqlalchemy.sql import select
 import datetime
 
 cdm = CdmEngineFactory()  # Creates SQLite database by default
 
+# Postgres example (db='mysql' also supported)
+# cdm = CdmEngineFactory(db='pgsql', host='', port=5432,
+#                       user='', pw='',
+#                       name='', schema='cdm6')
+
+
 engine = cdm.engine
-# Create Tables 
+# Create Tables if required
 metadata.create_all(engine)
-# Create vocabulary
+# Create vocabulary if required
 vocab = CdmVocabulary(cdm)
 # vocab.create_vocab('/path/to/csv/files')  # Uncomment to load vocabulary csv files
 
 # SQLAlchemy as ORM
 session =  cdm.session
-session.add(Cohort(cohort_definition_id=2, subject_id=100, 
-            cohort_end_date=datetime.datetime.now(), 
+session.add(Cohort(cohort_definition_id=2, subject_id=100,
+            cohort_end_date=datetime.datetime.now(),
             cohort_start_date=datetime.datetime.now()))
 session.commit()
 
-s = select([Cohort])
-result = session.execute(s)
+result = session.query(Cohort).all()
 for row in result:
     print(row)
-result.close()
-for v in session.query(Vocabulary).order_by(Vocabulary.vocabulary_name):
-    print(v.vocabulary_name)
+
+# Convert result to a pandas dataframe
+vec = CdmVector()
+vec.result = result
+print(vec.df.dtypes)
+
+# Execute a query and convert it to dataframe
+vec.sql_df(cdm, 'TEST') # TEST is defined in sqldict.py
+print(vec.df.dtypes) # vec.df is a pandas dataframe
+# OR
+vec.sql_df(cdm, query='SELECT * from cohort')
+print(vec.df.dtypes) # vec.df is a pandas dataframe
+
 
 ```
 
@@ -58,10 +74,7 @@ for v in session.query(Vocabulary).order_by(Vocabulary.vocabulary_name):
 pyomop -help
 ```
 
-## What to expect
-
-* Integration with machine learning libraries
 
 ## Contributors
 
-* [Bell Eapen](https://nuchange.ca)
+* [Bell Eapen](https://nuchange.ca)
diff --git a/src/pyomop/__init__.py b/src/pyomop/__init__.py
@@ -4,6 +4,7 @@
 from .engine_factory import CdmEngineFactory
 from .cdm6_tables import metadata
 from .vocabulary import CdmVocabulary
+from .vector import CdmVector
 
 from .cdm6_tables import AttributeDefinition
 from .cdm6_tables import CareSite

diff --git a/src/pyomop/cdm6_tables.py b/src/pyomop/cdm6_tables.py
@@ -1,3 +1,22 @@
+"""
+ Copyright (C) 2020 Bell Eapen
+
+ This file is part of PyOMOP.
+
+ PyOMOP is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ PyOMOP is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PyOMOP.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
 # coding: utf-8
 from sqlalchemy import BigInteger, Column, Integer, Numeric, String, Text
 from sqlalchemy.ext.declarative import declarative_base
@@ -435,7 +454,9 @@ class Person(Base):
     year_of_birth = Column(Integer, nullable=False)
     month_of_birth = Column(Integer)
     day_of_birth = Column(Integer)
-    time_of_birth = Column(String(10))
+    #time_of_birth = Column(String(10))
+    birth_datetime = Column(String(10))
+    death_datetime = Column(String(10))
     race_concept_id = Column(Integer, nullable=False)
     ethnicity_concept_id = Column(Integer, nullable=False)
     location_id = Column(Integer)

diff --git a/src/pyomop/sqldict.py b/src/pyomop/sqldict.py
@@ -0,0 +1,20 @@
+# Named SQL statements, looked up by key through the CDMSQL dict below.
+# Reference: OHDSI QueryLibrary queries
+# https://github.com/OHDSI/QueryLibrary/tree/master/inst/shinyApps/QueryLibrary/queries
+
+# Test: selects every column of every row in the person table
+TEST = "SELECT * from person"
+
+
+# PE03: Number of patients grouped by gender
+# Joins person to concept so each gender_concept_id is returned together
+# with its concept_name and the count of persons carrying it.
+PE03 = """
+SELECT
+  person.gender_concept_id,
+  concept.concept_name    AS gender_name,
+  COUNT(person.person_id) AS num_persons
+FROM person
+  JOIN concept ON person.gender_concept_id = concept.concept_id
+GROUP BY person.gender_concept_id, concept.concept_name
+"""
+# Maps the query key (a string) to its SQL text.
+CDMSQL = {
+    "TEST": TEST,
+    "PE03": PE03
+}
diff --git a/src/pyomop/vector.py b/src/pyomop/vector.py
@@ -0,0 +1,50 @@
+import pandas as pd
+from sqlalchemy.inspection import inspect
+from .sqldict import CDMSQL
+# https://gist.github.com/dermatologist/f436cb461a3290732a27c4dc040229f9
+# Thank you! https://gist.github.com/garaud
+class CdmVector(object):
+    """Turn query results into a pandas DataFrame.
+
+    The DataFrame can be built in two ways:
+
+    * assign an iterable of SQLAlchemy-inspectable objects to ``result``
+      and read ``df`` (built lazily on first access), or
+    * call ``sql_df`` to run a SQL statement and load its rows directly.
+    """
+
+    def __init__(self, result=None):
+        """Store an optional iterable of result objects; no DataFrame yet."""
+        self._result = result
+        self._df = None
+
+    @property
+    def df(self):
+        """Return the DataFrame, building it from ``result`` if needed."""
+        if self._df is None:
+            self.create_df()
+        return self._df
+
+    @property
+    def result(self):
+        """Return the iterable of result objects currently held."""
+        return self._result
+
+    @result.setter
+    def result(self, value):
+        """Replace the held result and drop the DataFrame cached from it."""
+        self._result = value
+        # A DataFrame built from the previous result (or by sql_df) would
+        # be stale, so force ``df`` to rebuild on next access.
+        self._df = None
+
+    def query_to_list(self):
+        """Flatten the held result into column names and row values.
+
+        Returns:
+            A ``(names, rows)`` pair: ``names`` are the attribute keys
+            reported by SQLAlchemy's ``inspect`` for the last object, and
+            ``rows`` holds one list of attribute values per object. An
+            empty result yields ``([], [])``.
+        """
+        # Pre-bind names so an empty result does not hit an unbound local.
+        names = []
+        rows = []
+        for obj in self._result:
+            attrs = inspect(obj).attrs
+            names = attrs.keys()
+            rows.append([attr.value for _, attr in attrs.items()])
+        return names, rows
+
+    def create_df(self, _names=None):
+        """Build the DataFrame from the held result.
+
+        Args:
+            _names: optional column names overriding the inspected ones.
+        """
+        names, data = self.query_to_list()
+        if _names:
+            names = _names
+        self._df = pd.DataFrame.from_records(data, columns=names)
+
+    def sql_df(self, cdm, sqldict=None, query=None, chunksize=None):
+        """Run a SQL statement and store its rows as the DataFrame.
+
+        Args:
+            cdm: object whose ``engine`` attribute is passed to pandas as
+                the database connection.
+            sqldict: key into ``CDMSQL``; when given it takes precedence
+                over ``query``.
+            query: raw SQL text, used when ``sqldict`` is not given.
+            chunksize: when set, rows are fetched in batches of this size
+                and concatenated, so ``df`` is always a single DataFrame.
+
+        Raises:
+            KeyError: if ``sqldict`` is not a key of ``CDMSQL``.
+            ValueError: if neither ``sqldict`` nor ``query`` is supplied.
+        """
+        if sqldict:
+            query = CDMSQL[sqldict]
+        if not query:
+            raise ValueError("sql_df needs either a sqldict key or a query")
+        if chunksize:
+            # chunksize must be passed by keyword: the third positional
+            # parameter of read_sql_query is index_col, not chunksize.
+            chunks = list(
+                pd.read_sql_query(query, cdm.engine, chunksize=chunksize))
+            if chunks:
+                self._df = pd.concat(chunks, ignore_index=True)
+            else:
+                self._df = pd.DataFrame()
+        else:
+            self._df = pd.read_sql_query(query, cdm.engine)
+
diff --git a/src/pyomop/vocabulary.py b/src/pyomop/vocabulary.py
@@ -64,26 +64,26 @@ def set_concept(self, concept_code, vocabulary_id=None):
             self._vocabulary_id = 0
             self._concept_id = 0
 
-    def create_vocab(self, folder, sample=False):
-        if sample: # nrows=100
+    def create_vocab(self, folder, sample=10):
+        if sample < 1000: # nrows=sample
             try:
-                df = pd.read_csv(folder + '/DRUG_STRENGTH.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/DRUG_STRENGTH.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('drug_strength', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/CONCEPT.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/CONCEPT.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('concept', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/CONCEPT_RELATIONSHIP.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/CONCEPT_RELATIONSHIP.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('concept_relationship', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/CONCEPT_ANCESTOR.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/CONCEPT_ANCESTOR.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('concept_ancester', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/CONCEPT_SYNONYM.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/CONCEPT_SYNONYM.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('concept_synonym', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/VOCABULARY.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/VOCABULARY.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('vocabulary', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/RELATIONSHIP.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/RELATIONSHIP.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('relationship', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/CONCEPT_CLASS.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/CONCEPT_CLASS.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('concept_class', con=self._engine, if_exists = 'replace')
-                df = pd.read_csv(folder + '/DOMAIN.csv', sep='\t', nrows=100,  error_bad_lines=False, warn_bad_lines=True)
+                df = pd.read_csv(folder + '/DOMAIN.csv', sep='\t', nrows=sample,  error_bad_lines=False, warn_bad_lines=True)
                 df.to_sql('domain', con=self._engine, if_exists = 'replace')
             except ValueError:
                 print("Oops!  Could not write vocabulary")

diff --git a/t_install.py b/t_install.py
@@ -1,13 +1,13 @@
-from pyomop import CdmEngineFactory, CdmVocabulary, Cohort, Vocabulary, metadata
+from pyomop import CdmEngineFactory, CdmVocabulary, CdmVector, Cohort, Vocabulary, metadata
 from sqlalchemy.sql import select
 import datetime
 
 cdm = CdmEngineFactory()  # Creates SQLite database by default
 
 engine = cdm.engine
-# Create Tables 
+## Create Tables if required
 metadata.create_all(engine)
-# Create vocabulary
+## Create vocabulary if required
 vocab = CdmVocabulary(cdm)
 # vocab.create_vocab('/path/to/csv/files')  # Uncomment to load vocabulary csv files
 
@@ -18,10 +18,18 @@
             cohort_start_date=datetime.datetime.now()))
 session.commit()
 
-s = select([Cohort])
-result = session.execute(s)
+result = session.query(Cohort).all()
 for row in result:
     print(row)
-result.close()
-for v in session.query(Vocabulary).order_by(Vocabulary.vocabulary_name):
-    print(v.vocabulary_name)
+
+# Convert result to a pandas dataframe
+vec = CdmVector()
+vec.result = result
+print(vec.df.dtypes)
+
+# Execute a query and convert it to dataframe
+vec.sql_df(cdm, 'TEST') # TEST is defined in sqldict.py
+print(vec.df.dtypes)
+# OR
+vec.sql_df(cdm, query='SELECT * from cohort')
+print(vec.df.dtypes)