df = pd.read_csv('my-csv-file.csv')
df.to_csv('new-csv-file.csv')
clinic_north = df.clinic_north # select column
print(type(clinic_north))
print(type(df))
clinic_north_south = df[['clinic_north', 'clinic_south']] # select multiple columns
march = df.iloc[2] # select row 2 zero-indexed
april_may_june = df.iloc[-3:]
example = orders.iloc[1:4]
df2 = df.loc[[1, 3, 5]]
january = df[df.month == 'January'] # checking-based most
# most logical ('|' as 'or' and & as 'and') and relational (<, !=, etc) operators allowed
march_april = df[(df.month == 'March') | (df.month == 'April')]
january_february_march = df[df.month.isin(['January', 'February', 'March'])]
df3 = df2.reset_index(drop = True, inplace = True)
Manipulations
# Adding new columns
df['Is taxed?'] = 'Yes' # is same as
df['Is taxed?'] = ['Yes', 'Yes', 'Yes'] # consider presence of three rows in data set
df['Lowercase Name'] = df.Name.apply(lower)
get_last_name = lambda x: x.split()[-1]
df['last_name'] = df.name.apply(get_last_name)
total_earned = lambda row: (row['hours_worked'] - 40) * row['hourly_wage'] * 1.5
+ row['hourly_wage'] * 40
if row['hours_worked'] > 40 else
row['hours_worked'] * row['hourly_wage']
df['total_earned'] = df.apply(total_earned, axis = 1) # axis is needed to specify row
Aggregates
mean # Average of all values in column
std # Standard deviation
median # Median
max # Maximum value in column
min # Minimum value in column
count # Number of values in column
nunique # Number of unique values in column
unique # List of unique values in column
np.percentile(<input>, <...<=reqd_percentile_value>...>)
df.groupby('column1').column2.measurement() # syntax
# .apply() .reset_index() and .rename() can be used as well in association
# example
high_earners = df.groupby('category').wage
.apply(lambda x: np.percentile(x, 75))
.reset_index()
df.groupby(['Location', 'Day of Week'])['Total Sales'].mean().reset_index()
# First use the groupby statement:
unpivoted = df.groupby(['Location', 'Day of Week'])['Total Sales'].mean().reset_index()
# Now pivot the table
pivoted = unpivoted.pivot(
columns='Day of Week',
index='Location',
values='Total Sales')