/
vectorize_conditions.py
140 lines (106 loc) · 3.28 KB
/
vectorize_conditions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 29 15:32:22 2023
@author: cghiaus
Problem:
Apply a logical condition across every row of a DataFrame.
Assign the result to a new column.
Solutions:
Level 1 Looping:
Define a function that holds the reward logic for one row (i.e., one person).
Loop over each row of the DataFrame to apply the condition and
obtain the value of a cell.
Level 2 Vectorization:
Apply the logical conditions to the whole DataFrame.
Assign the default values to the column.
Assign the Series of calculated values with condition.
Based on:
"Make Your Pandas Code Lightning Fast"
https://youtu.be/SAFmrTnEHLg
Example
=======
The problem:
Given a population for which each person has the characteristics:
age, time_in_bed, percent_sleeping, favorite_food, hate_food
create a new column with their favorite food or hate food as a reward.
Reward logic:
IF (they were in bed for more than 1 hour
AND if they slept for more than 10 %)
OR
if they are 90 years old or older,
THEN
give them their favorite food.
ELSE
give them their hate food.
"""
import numpy as np
import pandas as pd
def generate_data(size=10_000):
    """
    Generate a DataFrame of random data, one row per person.

    Columns, in order:
        age, time_in_bed, percent_sleeping, favorite_food, hate_food

    Parameters
    ----------
    size : int > 0
        n° of samples (rows) in the DataFrame.

    Returns
    -------
    df : pandas.DataFrame
        `size` rows with:
        - age : random integers from 0 to 99;
        - time_in_bed : random integers from 0 to 8;
        - percent_sleeping : random floats in [0, 1);
        - favorite_food : random choice among '+pizza', '+tacos',
          '+ice-cream';
        - hate_food : random choice among '-brocolli', '-potato', '-eggs'.
    """
    # The dict entries are evaluated top to bottom, so the random draws
    # happen in column order.
    data = {
        'age': np.random.randint(0, 100, size),
        'time_in_bed': np.random.randint(0, 9, size),
        'percent_sleeping': np.random.rand(size),
        'favorite_food': np.random.choice(
            ['+pizza', '+tacos', '+ice-cream'], size),
        'hate_food': np.random.choice(
            ['-brocolli', '-potato', '-eggs'], size),
    }
    return pd.DataFrame(data)
def reward(person):
    """
    Pick the reward for one person (one row of the DataFrame).

    The favorite food is given when the person was in bed for more than
    1 hour and slept for more than 10 % of that time, or when the person
    is 90 years old or older. Otherwise the hated food is given.

    Parameters
    ----------
    person : Series
        row for a person in the DataFrame; read through the keys
        'time_in_bed', 'percent_sleeping', 'age', 'favorite_food' and
        'hate_food'.

    Returns
    -------
    person['favorite_food'] OR person['hate_food'] : value from row
        the value to be assigned to the new column of df.
    """
    # Slept well enough: long enough in bed AND enough of it asleep.
    if person['time_in_bed'] > 1 and person['percent_sleeping'] > 0.1:
        return person['favorite_food']

    # Age alone is sufficient, whatever the sleep figures are.
    if person['age'] >= 90:
        return person['favorite_food']

    return person['hate_food']
size = 10    # n° of samples in the DataFrame
df = generate_data(size)

# LEVEL 1: Looping
# ****************
# Loop over each row of the df and apply the condition given in the function.
# For each row, assign the result to a cell of the df.

# Work on an independent copy: a plain `df_loop = df` would only bind a
# second name to the same object, so both methods would write their 'reward'
# column into one and the same DataFrame.
df_loop = df.copy()
for index, person in df_loop.iterrows():
    df_loop.loc[index, 'reward'] = reward(person)

# LEVEL 2: Vectorization
# **********************
# Instead of looping on each row,
# apply the logical conditions to the whole DataFrame.

# Second independent copy, so that this result is computed separately from
# the looping result.
df_vector = df.copy()

# Boolean Series: True for the rows which get their favorite food.
condition = ((df_vector['time_in_bed'] > 1
              ) & (df_vector['percent_sleeping'] > 0.1)
             ) | (df_vector['age'] >= 90)

# Default value for every row, then overwrite the rows where condition holds.
df_vector['reward'] = df_vector['hate_food']
df_vector.loc[condition, 'reward'] = df_vector['favorite_food']

# Check if the two df are equal.
# The comparison is meaningful only because df_loop and df_vector are
# distinct objects.
print("Are the two DataFrames equal? ", df_loop.equals(df_vector))