Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow queries with parameters to avoid SQL injection #75

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 17 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
pandasql
========

`pandasql` allows you to query `pandas` DataFrames using SQL syntax. It works
similarly to `sqldf` in R. `pandasql` seeks to provide a more familiar way of
`pandasql` allows you to query `pandas` DataFrames using SQL syntax. It works
similarly to `sqldf` in R. `pandasql` seeks to provide a more familiar way of
manipulating and cleaning data for people new to Python or `pandas`.

#### Installation
Expand All @@ -15,15 +15,15 @@ The main function used in pandasql is `sqldf`. `sqldf` accepts 2 parameters
- a sql query string
- a set of session/environment variables (`locals()` or `globals()`)

Specifying `locals()` or `globals()` can get tedious. You can define a short
Specifying `locals()` or `globals()` can get tedious. You can define a short
helper function to fix this.

from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
pysqldf = lambda q, params=None: sqldf(q, globals(), params=params)

#### Querying
`pandasql` uses [SQLite syntax](http://www.sqlite.org/lang.html). Any `pandas`
dataframes will be automatically detected by `pandasql`. You can query them as
`pandasql` uses [SQLite syntax](http://www.sqlite.org/lang.html). Any `pandas`
dataframes will be automatically detected by `pandasql`. You can query them as
you would any regular SQL table.


Expand Down Expand Up @@ -76,9 +76,19 @@ joins and aggregations are also supported
4 1948 8766
```

queries with parameters are supported
```
>>> iris = load_iris()
>>> iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
>>> print pysqldf("SELECT DISTINCT species FROM iris_df WHERE species <> ? ", params=('versicolor',) )
species
0 setosa
1 virginica
```

More information and code samples available in the [examples](https://github.com/yhat/pandasql/blob/master/examples/demo.py)
folder or on [our blog](http://blog.yhathq.com/posts/pandasql-sql-for-pandas-dataframes.html).



[![Analytics](https://ga-beacon.appspot.com/UA-46996803-1/pandasql/README.md)](https://github.com/yhat/pandasql)
[![Analytics](https://ga-beacon.appspot.com/UA-46996803-1/pandasql/README.md)](https://github.com/yhat/pandasql)
25 changes: 25 additions & 0 deletions examples/demo2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os, time
import pandas as pd
from pandasql import sqldf

# Small throwaway DataFrame for the demo: one row per record, mixing strings
# (including an embedded quote), numbers, boolean-like values and file paths.
data = [
    ["abc", 123, True, "C:\\temp"],
    ["d'ef", -45.6, False, "C:\\windows"],
    ["xyz", 0.89, 0, "/usr/"],
]
df = pd.DataFrame(data, columns=["id", "n", "b", "f"])


# define 'pysqldf' as per pandasql documentation, with extra params and user-defined-functions registration

def my_sqlite_connect_listener(dbapi_con, con_record):
    """Register a few extra scalar SQL functions on a SQLite connection.

    Functions made available to SQL once this has run:

    - ``IIF(b, t, f)``: ``t`` when ``b`` is truthy, otherwise ``f``.
    - ``CUBE(x)``: ``x * x * x``.
    - ``FileExists(f)``: whether the path ``f`` exists.
    - ``FileModificationDate(f)``: ``time.ctime`` string of the file's
      modification time, or NULL when it cannot be determined.

    A NULL argument never raises: ``CUBE`` and ``FileModificationDate``
    yield NULL, ``FileExists`` yields false.

    Args:
        dbapi_con: connection object exposing ``create_function(name,
            n_args, callable)`` (e.g. a ``sqlite3.Connection``).
        con_record: second argument handed to the listener; unused here.
    """

    def _cube(x):
        # Propagate SQL NULL instead of failing on ``None * None``.
        return None if x is None else x * x * x

    def _file_exists(path):
        # ``os.path.exists(None)`` raises TypeError, so guard NULL explicitly.
        return path is not None and os.path.exists(path)

    def _file_modification_date(path):
        if path is None:
            return None
        # Ask for the timestamp directly rather than checking existence
        # first: the file could vanish between the check and the read.
        try:
            return time.ctime(os.path.getmtime(path))
        except (OSError, ValueError):
            return None

    dbapi_con.create_function('IIF', 3, lambda b, t, f: t if b else f)
    dbapi_con.create_function('CUBE', 1, _cube)
    dbapi_con.create_function('FileExists', 1, _file_exists)
    dbapi_con.create_function('FileModificationDate', 1, _file_modification_date)

def pysqldf(q, params=None):
    """Run SQL query ``q`` through ``sqldf`` using this module's globals.

    ``params`` is forwarded as the query's bound parameters, and
    ``my_sqlite_connect_listener`` is passed as the SQLite connect listener
    so its extra SQL functions are available to the query.
    """
    return sqldf(
        q,
        globals(),
        params=params,
        sqlite_connect_listener=my_sqlite_connect_listener,
    )


# Demo queries exercising the extra SQL functions. Each entry pairs a query
# with its bound parameters (None when the query takes none).
for demo_query, demo_params in (
    ("select n, IIF(n<0, 'n is negative', 'n is positive') from df where id<>?", ('abc',)),
    ("select CUBE(2), CUBE(3), CUBE(4), CUBE(5)", None),
    ("select f, FileExists(f), FileModificationDate(f) from df", None),
):
    print(pysqldf(demo_query, params=demo_params))
12 changes: 7 additions & 5 deletions pandasql/sqldf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class PandaSQLException(Exception):


class PandaSQL:
def __init__(self, db_uri='sqlite:///:memory:', persist=False):
def __init__(self, db_uri='sqlite:///:memory:', persist=False, sqlite_connect_listener=None):
"""
Initialize with a specific database.

Expand All @@ -26,6 +26,8 @@ def __init__(self, db_uri='sqlite:///:memory:', persist=False):

if self.engine.name == 'sqlite':
listen(self.engine, 'connect', self._set_text_factory)
if self.engine.name == 'sqlite' and sqlite_connect_listener is not None:
listen(self.engine, 'connect', sqlite_connect_listener)

if self.engine.name not in ('sqlite', 'postgresql'):
raise PandaSQLException('Currently only sqlite and postgresql are supported.')
Expand All @@ -36,7 +38,7 @@ def __init__(self, db_uri='sqlite:///:memory:', persist=False):
self._conn = self.engine.connect()
self._init_connection(self._conn)

def __call__(self, query, env=None):
def __call__(self, query, env=None, params=None):
"""
Execute the SQL query.
Automatically creates tables mentioned in the query from dataframes before executing.
Expand All @@ -61,7 +63,7 @@ def __call__(self, query, env=None):
write_table(env[table_name], table_name, conn)

try:
result = read_sql(query, conn)
result = read_sql(query, conn, params=params)
except DatabaseError as ex:
raise PandaSQLException(ex)
except ResourceClosedError:
Expand Down Expand Up @@ -126,7 +128,7 @@ def write_table(df, tablename, conn):
index=not any(name is None for name in df.index.names)) # load index into db if all levels are named


def sqldf(query, env=None, db_uri='sqlite:///:memory:'):
def sqldf(query, env=None, db_uri='sqlite:///:memory:', persist=False, sqlite_connect_listener=None, params=None):
"""
Query pandas data frames using sql syntax
This function is meant for backward compatibility only. New users are encouraged to use the PandaSQL class.
Expand Down Expand Up @@ -158,4 +160,4 @@ def sqldf(query, env=None, db_uri='sqlite:///:memory:'):
>>> sqldf("select * from df;", locals())
>>> sqldf("select avg(x) from df;", locals())
"""
return PandaSQL(db_uri)(query, env)
return PandaSQL(db_uri, persist, sqlite_connect_listener)(query, env, params)