Pandas


Introduction

import pandas as pd

# Build a DataFrame from a dictionary: keys become column names
df1 = pd.DataFrame({
  'Product ID': [1, 2, 3, 4],
  'Product Name': ['t-shirt', 't-shirt', 'skirt', 'skirt'],
  'Color': ['blue', 'green', 'red', 'black']
})

print(df1)


# Build a DataFrame from a list of rows, naming the columns explicitly
df2 = pd.DataFrame([
    ['John Smith', '123 Main St.', 34],
    ['Jane Doe', '456 Maple Ave.', 28],
    ['Joe Schmo', '789 Broadway', 51]
    ],
    columns=['name', 'address', 'age'])

CSV

df = pd.read_csv('my-csv-file.csv') # read a CSV file into a DataFrame
df.to_csv('new-csv-file.csv') # write a DataFrame out to a CSV file
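# If you don't want the row index written out as a column:
df.to_csv('new-csv-file.csv', index=False)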


clinic_north = df.clinic_north # select a single column (returns a Series)
print(type(clinic_north)) # <class 'pandas.core.series.Series'>
print(type(df)) # <class 'pandas.core.frame.DataFrame'>

clinic_north_south = df[['clinic_north', 'clinic_south']] # select multiple columns

march = df.iloc[2] # select the row at position 2 (zero-indexed)
april_may_june = df.iloc[-3:] # select the last three rows
example = orders.iloc[1:4] # select rows 1-3 (the end of the slice is exclusive)

df2 = df.loc[[1, 3, 5]] # select rows by index label with .loc

january = df[df.month == 'January'] # select rows using a logical condition
# most logical ('|' for 'or', '&' for 'and') and relational (<, !=, etc.) operators are allowed
march_april = df[(df.month == 'March') | (df.month == 'April')]
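# A sketch of the '&' case -- 'day' is a hypothetical numeric column used only for illustration:
early_march = df[(df.month == 'March') & (df.day <= 15)]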

january_february_march = df[df.month.isin(['January', 'February', 'March'])]

df3 = df2.reset_index(drop = True) # returns a new DataFrame; inplace = True would modify df2 in place and return None

Manipulations

# Adding new columns 
df['Is taxed?'] = 'Yes' # is the same as
df['Is taxed?'] = ['Yes', 'Yes', 'Yes'] # assuming the data set has three rows

df['Lowercase Name'] = df.Name.apply(str.lower) # apply a function to every value in a column

get_last_name = lambda x: x.split()[-1]
df['last_name'] = df.name.apply(get_last_name)

total_earned = (lambda row: (row['hours_worked'] - 40) * row['hourly_wage'] * 1.5
                            + row['hourly_wage'] * 40
                if row['hours_worked'] > 40
                else row['hours_worked'] * row['hourly_wage'])
df['total_earned'] = df.apply(total_earned, axis = 1) # axis = 1 applies the function row by row

Aggregates

mean       # Average of all values in column
std        # Standard deviation
median     # Median
max        # Maximum value in column
min        # Minimum value in column
count      # Number of values in column
nunique    # Number of unique values in column
unique     # List of unique values in column
np.percentile(<input>, <required percentile value>)   # Nth percentile of the values (NumPy, not a DataFrame method)
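# A quick usage sketch -- 'wage' is a hypothetical column used only for illustration (requires import numpy as np):
print(df.wage.mean())               # average wage across all rows
print(df.wage.nunique())            # number of distinct wages
print(np.percentile(df.wage, 75))   # 75th percentile of wages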

df.groupby('column1').column2.measurement() # syntax
# .apply(), .reset_index() and .rename() can be chained as well (rename sketch below)

# example (requires import numpy as np)
high_earners = (df.groupby('category').wage
    .apply(lambda x: np.percentile(x, 75))
    .reset_index())
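# Sketch: renaming the aggregated column afterwards (the new name is just illustrative)
high_earners = high_earners.rename(columns={'wage': 'wage_75th_percentile'})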
df.groupby(['Location', 'Day of Week'])['Total Sales'].mean().reset_index()


# First use the groupby statement:
unpivoted = df.groupby(['Location', 'Day of Week'])['Total Sales'].mean().reset_index()
# Now pivot the table
pivoted = unpivoted.pivot(
    columns='Day of Week',
    index='Location',
    values='Total Sales')

Multiple Tables

big_df = orders.merge(customers).merge(products) # chain .merge() to combine more than two tables
df = pd.merge(orders, customers) # inner merge on the columns the tables share
df2 = pd.merge(
    orders,
    customers.rename(columns={'id': 'customer_id'})) # rename so the join columns match
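# An equivalent sketch using left_on/right_on instead of renaming
# (suffixes label any non-key columns that appear in both tables):
df3 = pd.merge(
    orders,
    customers,
    left_on='customer_id',
    right_on='id',
    suffixes=['_order', '_customer'])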

# Merges other than Inner Merge
pd.merge(company_a, company_b, how='outer') # keep all rows from both tables (missing values become NaN)
pd.merge(company_a, company_b, how='left')  # keep every row of company_a, matching from company_b where possible
pd.merge(company_a, company_b, how='right') # keep every row of company_b, matching from company_a where possible

pd.concat([df1, df2, df3, ...]) # stack DataFrames with identical columns on top of each other