/
dask_io.py
123 lines (100 loc) · 4.13 KB
/
dask_io.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Methods to allow dask.DataFrame (deprecated).
This module and all its submodules are deprecated. See
[contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
for migration instructions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from tensorflow.python.util.deprecation import deprecated
try:
# pylint: disable=g-import-not-at-top
import dask.dataframe as dd
allowed_classes = (dd.Series, dd.DataFrame)
HAS_DASK = True
except ImportError:
HAS_DASK = False
def _add_to_index(df, start):
"""New dask.dataframe with values added to index of each subdataframe."""
df = df.copy()
df.index += start
return df
def _get_divisions(df):
"""Number of rows in each sub-dataframe."""
lengths = df.map_partitions(len).compute()
divisions = np.cumsum(lengths).tolist()
divisions.insert(0, 0)
return divisions
def _construct_dask_df_with_divisions(df):
"""Construct the new task graph and make a new dask.dataframe around it."""
divisions = _get_divisions(df)
# pylint: disable=protected-access
name = 'csv-index' + df._name
dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
for i in range(df.npartitions)}
# pylint: enable=protected-access
from toolz import merge # pylint: disable=g-import-not-at-top
if isinstance(df, dd.DataFrame):
return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
elif isinstance(df, dd.Series):
return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
@deprecated(None, 'Please feed input to tf.data to support dask.')
def extract_dask_data(data):
"""Extract data from dask.Series or dask.DataFrame for predictors.
Given a distributed dask.DataFrame or dask.Series containing columns or names
for one or more predictors, this operation returns a single dask.DataFrame or
dask.Series that can be iterated over.
Args:
data: A distributed dask.DataFrame or dask.Series.
Returns:
A dask.DataFrame or dask.Series that can be iterated over.
If the supplied argument is neither a dask.DataFrame nor a dask.Series this
operation returns it without modification.
"""
if isinstance(data, allowed_classes):
return _construct_dask_df_with_divisions(data)
else:
return data
@deprecated(None, 'Please feed input to tf.data to support dask.')
def extract_dask_labels(labels):
"""Extract data from dask.Series or dask.DataFrame for labels.
Given a distributed dask.DataFrame or dask.Series containing exactly one
column or name, this operation returns a single dask.DataFrame or dask.Series
that can be iterated over.
Args:
labels: A distributed dask.DataFrame or dask.Series with exactly one
column or name.
Returns:
A dask.DataFrame or dask.Series that can be iterated over.
If the supplied argument is neither a dask.DataFrame nor a dask.Series this
operation returns it without modification.
Raises:
ValueError: If the supplied dask.DataFrame contains more than one
column or the supplied dask.Series contains more than
one name.
"""
if isinstance(labels, dd.DataFrame):
ncol = labels.columns
elif isinstance(labels, dd.Series):
ncol = labels.name
if isinstance(labels, allowed_classes):
if len(ncol) > 1:
raise ValueError('Only one column for labels is allowed.')
return _construct_dask_df_with_divisions(labels)
else:
return labels