airbnb fixes and improvements (#2600)
* airbnb_fixes

* airbnb_fixes

* lint

* random_state

* random_state
JKL98ISR committed Jun 14, 2023
1 parent 057d795 commit 62a73ae
Showing 2 changed files with 44 additions and 70 deletions.
101 changes: 36 additions & 65 deletions deepchecks/tabular/datasets/regression/airbnb.py
@@ -79,21 +79,19 @@
     - Label
       - The rental price of the unit
 """
+import math
 import time
 import typing as t
-from typing import Tuple
 
 import numpy as np
 import pandas as pd
 
 from deepchecks.tabular.dataset import Dataset
 
-__all__ = ['load_data', 'load_pre_calculated_prediction', 'load_pre_calculated_feature_importance']
-
-from numpy import ndarray
+__all__ = ['load_data_and_predictions', 'load_pre_calculated_feature_importance']
 
 _TRAIN_DATA_URL = 'https://drive.google.com/uc?export=download&id=1UWkr1BQlyyUkbsW5hHIFTr-x0evZE3Ie'
-_TEST_DATA_URL = 'https://drive.google.com/uc?export=download&id=1v_0ZyyycoFfltJ6wj_riGZoXhtPzrnqR'
+_TEST_DATA_URL = 'https://drive.google.com/uc?export=download&id=1lfpWVtDktrnsLUzCN1tkRc1jRbguEz3a'
 _target = 'price'
 _predictions = 'predictions'
 _datetime = 'timestamp'
@@ -103,8 +101,9 @@
 _FEATURES = _NUM_FEATURES + _CAT_FEATURES
 
 
-def load_data(data_format: str = 'Dataset', as_train_test: bool = True, modify_timestamps: bool = True,
-              data_size: t.Optional[int] = 15000) -> t.Union[t.Tuple, t.Union[Dataset, pd.DataFrame]]:
+def load_data_and_predictions(data_format: str = 'Dataset', load_train: bool = True, modify_timestamps: bool = True,
+                              data_size: t.Optional[int] = 15000, random_state: int = 42) \
+        -> t.Tuple[t.Union[Dataset, pd.DataFrame], np.ndarray]:
     """Load and returns the Airbnb NYC 2019 dataset (regression).
     Parameters
Expand All @@ -113,76 +112,48 @@ def load_data(data_format: str = 'Dataset', as_train_test: bool = True, modify_t
Represent the format of the returned value. Can be 'Dataset'|'Dataframe'
'Dataset' will return the data as a Dataset object
'Dataframe' will return the data as a pandas Dataframe object
as_train_test : bool , default: True
If True, the returned data is split into train and test exactly like the toy model
was trained. The first return value is the train data and the second is the test data.
In order to get this model, call the load_fitted_model() function.
Otherwise, returns a single object.
load_train : bool , default: True
If True, the returned data is the train data. otherwise the test dataset.
modify_timestamps : bool , default: True
If True, the returned data timestamp column will be for the last 30 days.
Otherwise, the data timestamp will be for March 2023.
data_size : t.Optional[int] , default: 15000
The number of samples to return. If None, returns all the data.
random_state : int , default 42
The random state to use for sampling.
Returns
-------
dataset : Union[deepchecks.Dataset, pd.DataFrame]
the data object, corresponding to the data_format attribute.
train_data, test_data : Tuple[Union[deepchecks.Dataset, pd.DataFrame],Union[deepchecks.Dataset, pd.DataFrame]
tuple if as_train_test = True. Tuple of two objects represents the dataset split to train and test sets.
dataset, predictions : Tuple[Union[deepchecks.Dataset, pd.DataFrame], np.ndarray]
Tuple of the deepchecks dataset or dataframe and the predictions.
"""
train = pd.read_csv(_TRAIN_DATA_URL, index_col=0).drop(_predictions, axis=1)
test = pd.read_csv(_TEST_DATA_URL, index_col=0).drop(_predictions, axis=1)
if load_train:
dataset = pd.read_csv(_TRAIN_DATA_URL)
else:
dataset = pd.read_csv(_TEST_DATA_URL)

if data_size is not None:
if data_size < len(train):
train = train.sample(data_size, random_state=42)
if data_size < len(test):
test = test.sample(data_size, random_state=42)

if modify_timestamps:
if data_size < len(dataset):
dataset = dataset.sample(data_size, random_state=random_state)
elif data_size > len(dataset):
dataset = pd.concat([dataset] * math.ceil(data_size / len(dataset)), axis=0, ignore_index=True)
dataset = dataset.sample(data_size, random_state=random_state)
if not load_train:
dataset = dataset.sort_values(_datetime)

if modify_timestamps and not load_train:
current_time = int(time.time())
time_test_start = current_time - 86400 * 30 # Span data for 30 days
test[_datetime] = np.sort((np.random.rand(len(test)) * (current_time - time_test_start)) + time_test_start)
test[_datetime] = test[_datetime].apply(lambda x: pd.Timestamp(x, unit='s'))

if not as_train_test:
dataset = pd.concat([train, test.drop(_datetime, axis=1)], axis=0, ignore_index=True)
if data_format == 'Dataset':
dataset = Dataset(dataset, label=_target, cat_features=_CAT_FEATURES, features=_FEATURES)
return dataset
else:
if data_format == 'Dataset':
train = Dataset(train, label=_target, cat_features=_CAT_FEATURES,
features=_FEATURES)
test = Dataset(test, label=_target, cat_features=_CAT_FEATURES,
datetime_name=_datetime, features=_FEATURES)
return train, test


def load_pre_calculated_prediction(data_size: t.Optional[int] = 15000) -> Tuple[ndarray, ndarray]:
"""Load the pre-calculated prediction for the Airbnb NYC 2019 dataset.
Parameters
----------
data_size : t.Optional[int] , default: 15000
The number of samples to return. If None, returns all the data.
Returns
-------
predictions : Tuple(np.ndarray, np.ndarray)
The first element is the pre-calculated prediction for the train set.
The second element is the pre-calculated prediction for the test set.
"""
usable_columns = [_target, _predictions]
train = pd.read_csv(_TRAIN_DATA_URL, usecols=usable_columns)
test = pd.read_csv(_TEST_DATA_URL, usecols=usable_columns)
if data_size is not None:
if data_size < len(train):
train = train.sample(data_size, random_state=42)
if data_size < len(test):
test = test.sample(data_size, random_state=42)
return np.asarray(train[_predictions]), np.asarray(test[_predictions])
dataset[_datetime] = np.sort(
(np.random.rand(len(dataset)) * (current_time - time_test_start)) + time_test_start
)
dataset[_datetime] = dataset[_datetime].apply(lambda x: pd.Timestamp(x, unit='s'))

predictions = np.asarray(dataset[_predictions])
dataset.drop(_predictions, axis=1, inplace=True)
if data_format == 'Dataset':
dataset = Dataset(dataset, label=_target, cat_features=_CAT_FEATURES,
features=_FEATURES)
return dataset, predictions


def load_pre_calculated_feature_importance() -> pd.Series:
Expand Down
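For orientation, a minimal usage sketch of the renamed loader (an editor's illustration, not part of the commit; it assumes deepchecks is installed and the Google Drive URLs above are reachable):

from deepchecks.tabular.datasets.regression import airbnb

# Train split: sample 1,000 rows reproducibly, together with their pre-computed predictions.
train_ds, train_preds = airbnb.load_data_and_predictions(
    data_format='Dataset', load_train=True, data_size=1_000, random_state=0)

# Test split as a plain DataFrame; by default its timestamps are rewritten to the last 30 days.
test_df, test_preds = airbnb.load_data_and_predictions(
    data_format='DataFrame', load_train=False, data_size=1_000)
assert len(test_df) == len(test_preds) == 1_000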
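The 30-day timestamp rewrite applied to the test split reduces to the following standalone sketch (illustrative only, mirroring the added lines above):

import time

import numpy as np
import pandas as pd

n = 4  # illustrative row count
current_time = int(time.time())
time_test_start = current_time - 86400 * 30  # window opens 30 days back
# Draw uniform random epoch seconds inside the window, sorted so time increases row by row.
stamps = np.sort(np.random.rand(n) * (current_time - time_test_start) + time_test_start)
print(pd.Series(stamps).apply(lambda x: pd.Timestamp(x, unit='s')))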
13 changes: 8 additions & 5 deletions tests/tabular/test_datasets.py
@@ -76,8 +76,11 @@ def test_model_predict_on_wine_quality():
     assert_dataset_module(wine_quality)
 
 def test_sampling_airbnb():
-    train_sampled , _ = airbnb.load_data(data_format='DataFrame')
-    pred_full, _ = airbnb.load_pre_calculated_prediction(data_size=None)
-    pred_sampled, _ = airbnb.load_pre_calculated_prediction()
-
-    assert list(pred_full[train_sampled.index]) == list(pred_sampled)
+    train_sampled, train_pred_sampled = airbnb.load_data_and_predictions(data_format='DataFrame')
+    assert len(train_sampled) == len(train_pred_sampled) == 15_000
+    train_inflated, train_pred_inflated = airbnb.load_data_and_predictions(data_format='DataFrame', data_size=100_000)
+    assert len(train_inflated) == len(train_pred_inflated) == 100_000
+    test_sampled, test_pred_sampled = airbnb.load_data_and_predictions(load_train=False)
+    assert len(test_sampled) == len(test_pred_sampled) == 15_000
+    test_inflated, test_pred_inflated = airbnb.load_data_and_predictions(load_train=False, data_size=100_000)
+    assert len(test_inflated) == len(test_pred_inflated) == 100_000
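The inflation path these tests exercise (requesting more rows than the source data holds) duplicates the frame and then samples down, roughly as in this standalone sketch (hypothetical helper name `_resize`; in the commit the logic lives inline in `load_data_and_predictions`):

import math

import pandas as pd


def _resize(df: pd.DataFrame, data_size: int, random_state: int = 42) -> pd.DataFrame:
    # Down-sample when the frame already has more rows than requested.
    if data_size < len(df):
        return df.sample(data_size, random_state=random_state)
    # Otherwise duplicate the frame enough times to cover the request, then sample down to size.
    if data_size > len(df):
        df = pd.concat([df] * math.ceil(data_size / len(df)), axis=0, ignore_index=True)
        return df.sample(data_size, random_state=random_state)
    return df


# e.g. a 10-row frame inflated to 25 rows:
assert len(_resize(pd.DataFrame({'a': range(10)}), 25)) == 25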
