Skip to content

Commit

Permalink
Implement DataFrame.insert (#1983)
Browse files Browse the repository at this point in the history
ref #1929

Insert column into DataFrame at a specified location.

```
        >>> kdf = ks.DataFrame([1, 2, 3])
        >>> kdf.insert(0, 'x', 4)
        >>> kdf.sort_index()
           x  0
        0  4  1
        1  4  2
        2  4  3

        >>> from databricks.koalas.config import set_option, reset_option
        >>> set_option("compute.ops_on_diff_frames", True)

        >>> kdf.insert(1, 'y', [5, 6, 7])
        >>> kdf.sort_index()
           x  y  0
        0  4  5  1
        1  4  6  2
        2  4  7  3

        >>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
        >>> kdf.sort_index()
           x  y   z  0
        0  4  5   8  1
        1  4  6   9  2
        2  4  7  10  3

        >>> reset_option("compute.ops_on_diff_frames")
```
  • Loading branch information
xinrong-meng committed Jan 20, 2021
1 parent c38c96f commit 8803344
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 1 deletion.
82 changes: 82 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@
spark_type_to_pandas_dtype,
DataFrameType,
SeriesType,
Scalar,
)
from databricks.koalas.plot import KoalasPlotAccessor

Expand Down Expand Up @@ -3711,6 +3712,87 @@ def notnull(self) -> "DataFrame":

notna = notnull

def insert(
    self,
    loc: int,
    column,
    value: Union[Scalar, "Series", Iterable],
    allow_duplicates: bool = False,
) -> None:
    """
    Insert column into DataFrame at specified location.

    The DataFrame is modified in place. Raises a ValueError if `column` is
    already contained in the DataFrame. Duplicate labels are not supported,
    so `allow_duplicates` must be left as False.

    Parameters
    ----------
    loc : int
        Insertion index. Must verify 0 <= loc <= len(columns).
    column : str, number, or hashable object
        Label of the inserted column. When given as a tuple, its length
        must equal the number of column levels.
    value : int, Series, or array-like
        Content of the inserted column.
    allow_duplicates : bool, optional
        Only False is accepted.

    Raises
    ------
    TypeError
        If `loc` is not an int.
    AssertionError
        If `loc` is out of range or `allow_duplicates` is not False.
    ValueError
        If `column` is not a scalar or a tuple of scalars, if a tuple
        `column` does not match the number of column levels, or if
        `column` already exists.

    Examples
    --------
    >>> kdf = ks.DataFrame([1, 2, 3])
    >>> kdf.sort_index()
       0
    0  1
    1  2
    2  3

    >>> kdf.insert(0, 'x', 4)
    >>> kdf.sort_index()
       x  0
    0  4  1
    1  4  2
    2  4  3

    >>> from databricks.koalas.config import set_option, reset_option
    >>> set_option("compute.ops_on_diff_frames", True)

    >>> kdf.insert(1, 'y', [5, 6, 7])
    >>> kdf.sort_index()
       x  y  0
    0  4  5  1
    1  4  6  2
    2  4  7  3

    >>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
    >>> kdf.sort_index()
       x  y   z  0
    0  4  5   8  1
    1  4  6   9  2
    2  4  7  10  3

    >>> reset_option("compute.ops_on_diff_frames")
    """
    if not isinstance(loc, int):
        raise TypeError("loc must be int")

    # These stay as assertions: callers (and the test-suite) expect
    # AssertionError for an out-of-range `loc` or `allow_duplicates=True`.
    assert 0 <= loc <= len(self.columns)
    assert allow_duplicates is False

    if not is_name_like_value(column):
        raise ValueError(
            '"column" should be a scalar value or tuple that contains scalar values'
        )

    if is_name_like_tuple(column):
        # NOTE(review): `.levels` is a MultiIndex attribute; if `self.columns`
        # can be a flat Index here, a tuple label would fail with
        # AttributeError instead of a clear error -- confirm.
        if len(column) != len(self.columns.levels):
            # To be consistent with pandas
            raise ValueError('"column" must have length equal to number of column levels.')

    if column in self.columns:
        # Wrap the label in a 1-tuple so that a tuple label is rendered as a
        # whole; passing it bare to `%` would unpack it into several format
        # arguments and raise TypeError instead of this ValueError.
        raise ValueError("cannot insert %s, already exists" % (column,))

    # Work on a copy: assign the new column (expected to land in the last
    # position), then move its label to `loc` and reorder the columns.
    kdf = self.copy()
    kdf[column] = value
    columns = kdf.columns[:-1].insert(loc, kdf.columns[-1])
    kdf = kdf[columns]
    # Swap the reordered frame into this instance so the change is in place.
    self._update_internal_frame(kdf._internal)

# TODO: add freq and axis parameter
def shift(self, periods=1, fill_value=None) -> "DataFrame":
"""
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ class _MissingPandasLikeDataFrame(object):
ewm = _unsupported_function("ewm")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
insert = _unsupported_function("insert")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
Expand Down
57 changes: 57 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,63 @@ def test_dataframe(self):
index_cols = pdf.columns[column_mask]
self.assert_eq(kdf[index_cols], pdf[index_cols])

def test_insert(self):
    """Compare DataFrame.insert with pandas for flat and multi-level columns."""

    def insert_and_compare(kdf, pdf):
        # Each step: (position, label, factory deriving the value from the
        # frame being modified). The factory lets the "d" step read the "b"
        # column of whichever frame it is about to be inserted into.
        steps = [
            (1, "b", lambda frame: 10),
            (2, "c", lambda frame: 0.1),
            (3, "d", lambda frame: frame.b + 1),
        ]
        for position, label, build in steps:
            kdf.insert(position, label, build(kdf))
            pdf.insert(position, label, build(pdf))
            self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)

    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)
    insert_and_compare(kdf, pdf)

    kser = ks.Series([4, 5, 6])

    # A Series that is not derived from kdf is rejected.
    with self.assertRaises(ValueError):
        kdf.insert(0, "y", kser)
    # Re-inserting an existing label.
    with self.assertRaisesRegex(ValueError, "cannot insert b, already exists"):
        kdf.insert(1, "b", 10)
    # A list is not an acceptable column label.
    with self.assertRaisesRegex(
        ValueError,
        '"column" should be a scalar value or tuple that contains scalar values',
    ):
        kdf.insert(0, list("abc"), kser)
    # Values whose length differs from the frame's.
    with self.assertRaises(ValueError):
        kdf.insert(0, "e", [7, 8, 9, 10])
    with self.assertRaises(ValueError):
        kdf.insert(0, "f", ks.Series([7, 8]))
    # Out-of-range position and unsupported allow_duplicates.
    with self.assertRaises(AssertionError):
        kdf.insert(100, "y", kser)
    with self.assertRaises(AssertionError):
        kdf.insert(1, "y", kser, allow_duplicates=True)

    #
    # DataFrame with MultiIndex as columns
    #
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)
    insert_and_compare(kdf, pdf)

    with self.assertRaisesRegex(ValueError, "cannot insert d, already exists"):
        kdf.insert(4, "d", 11)
    # A tuple label must have one entry per column level.
    with self.assertRaisesRegex(
        ValueError,
        '"column" must have length equal to number of column levels.',
    ):
        kdf.insert(4, ("e",), 11)

def test_inplace(self):
pdf, kdf = self.df_pair

Expand Down
42 changes: 42 additions & 0 deletions databricks/koalas/tests/test_ops_on_diff_frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,48 @@ def test_combine_first(self):
kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index()
)

def test_insert(self):
    """Compare DataFrame.insert with pandas when the value is a separate Series.

    Covers a default-index frame, a frame whose index differs from the
    inserted Series' index, and a frame with multi-level columns.
    """
    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Index different from inserting Series'
    #
    pdf = pd.DataFrame([1, 2, 3], index=[10, 20, 30])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Multi-index columns
    #
    # (A two-level fixture used to be built here and then immediately
    # overwritten by the three-level one below without being used; it has
    # been dropped.)
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    # Scalar label on multi-level columns.
    kdf.insert(0, "a", kser)
    pdf.insert(0, "a", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())
    # Full tuple label with one entry per column level.
    kdf.insert(0, ("b", "c", ""), kser)
    pdf.insert(0, ("b", "c", ""), pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

def test_compare(self):
if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
pser1 = pd.Series(["b", "c", np.nan, "g", np.nan])
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ Combining / joining / merging
DataFrame.merge
DataFrame.join
DataFrame.update
DataFrame.insert

Time series-related
-------------------
Expand Down

0 comments on commit 8803344

Please sign in to comment.