Part 4 - Productivity with Pandas

In the previous notebook, we introduced Pandas, which provides high-level data structures and functions designed to make working with structured or tabular data fast, easy, and expressive.

In this notebook, we will build on our knowledge of Pandas to be more productive. Pandas provides sophisticated, multi-level indexing functionality, along with the ability to perform data aggregation operations, such as grouping, merging, and joining data. It also provides capabilities for working with Time Series data that involves navigating and manipulating various date ranges and time indices. Let's dive into the details.

Working with Categorical Data

In many practical Data Science activities, you may come across data that contain categorical variables. These variables are typically stored as text values in columns. For such data, you may want to find the unique elements, frequency of each category present, or transform the categorical data into suitable numeric values.

Pandas provides various approaches to handle categorical data. To get started, let's create a small dataset and look at some examples. The Seaborn library comes preloaded with some sample datasets; we will load the tips dataset from Seaborn for our analysis.
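A minimal sketch of the setup might look like the following, assuming seaborn is installed and its built-in tips dataset is used:

```python
import pandas as pd
import seaborn as sns

# Load the tips dataset; columns include total_bill, tip, sex, smoker, day, time, size
tips = sns.load_dataset('tips')
tips.head()
```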

Unique Values and Value Counts

Pandas provides methods such as unique(), nunique(), and value_counts() to extract information about the values in a column.

unique() and nunique()

unique() can be used to identify the unique elements of a column.

The result is an array which can be easily converted to a list by chaining the tolist() function.

Similarly, unique() can be applied on the index.

nunique() can be used to count the number of unique values in a column.
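Assuming the tips DataFrame from the earlier snippet, the calls might look like this (the 'day' column is just an illustrative choice):

```python
# Distinct categories in the 'day' column
tips['day'].unique()

# Chain tolist() to get a plain Python list
tips['day'].unique().tolist()

# unique() also works on an index
tips.index.unique()

# Number of distinct values
tips['day'].nunique()
```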

value_counts()

value_counts() is used to determine the frequency of the different values present in a column.

reset_index() can be chained to the value_counts() operation to easily get the results as a DataFrame.
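For example, continuing with the 'day' column:

```python
# Frequency of each category
tips['day'].value_counts()

# Chain reset_index() to get the counts back as a DataFrame
tips['day'].value_counts().reset_index()
```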

One Hot Encoding

Many machine learning algorithms do not support the presence of categorical values in data. Pandas provides various approaches to transform the categorical data into suitable numeric values to create dummy variables, and one such approach is called One Hot Encoding. The basic strategy is to convert each category value into a new column and assign a 0 or 1 (True/False) value to the column. Dummy variables can be created using get_dummies.

The resulting dataset contains four new columns (one for each day): day_Thur, day_Fri, day_Sat, day_Sun. You can pass as many category columns as you would like and choose how to label the new columns using the prefix parameter.
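A minimal sketch, again using the 'day' column as an illustrative choice:

```python
# One new column per category; prefix controls how the new columns are labelled
pd.get_dummies(tips, columns=['day'], prefix='day').head()
```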

Binning Continuous Variables

You may come across scenarios where you need to bin continuous data into discrete chunks to be used as a categorical variable. We can use the pd.cut() function to cut our data into discrete buckets.

The function results in five equal-width bins. We can also specify bin edges to create specific non-uniform bins.

The operation creates two non-uniform categories for total_bill.
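A sketch of both variants; the explicit bin edges below are only an assumption for illustration:

```python
# Five equal-width bins over total_bill
pd.cut(tips['total_bill'], bins=5)

# Explicit edges create two non-uniform bins
pd.cut(tips['total_bill'], bins=[0, 20, 60])
```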

Data Aggregation

Summarizing data by applying various aggregation functions such as sum(), mean(), median() etc. to each group or category within the data is a critical component of a data analysis workflow. Simple aggregations can give you a high level overview but are often not enough to get a deeper understanding of the data.

Pandas provides a flexible groupby() operation which allows for quick and efficient aggregation on subsets of data.

GroupBy

The name "group by" comes from a command in the SQL language. Hadley Wickham, author of popular packages in R programming language, described grouping operations by coining the term split-apply-combine.

The image below shows a mockup of a simple group aggregation.

The groupby() method can be used to apply the basic split-apply-combine operation on a DataFrame by specifying the desired key column name.

The method returns a DataFrameGroupBy object. No actual computation has been performed by the groupby() method yet. The idea is that this object has all the information needed to then apply some operation to each of the groups in the data. This "lazy evaluation" approach means that common aggregation functions can be implemented very efficiently using groupby(). For example, to compute the mean, we can call the mean() method on the GroupBy object.

The data has been aggregated according to the group key and is now indexed by the unique values in the sex column. By default, all of the numeric columns are aggregated.
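A minimal sketch of the split and apply steps on the tips data:

```python
# Split step only; no computation has happened yet
grouped = tips.groupby('sex')
grouped                          # DataFrameGroupBy object

# Apply and combine; numeric_only=True restricts the mean to numeric columns
grouped.mean(numeric_only=True)
```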

Using Multiple Keys

Multiple column names can be passed as group keys to group the data appropriately. Let's group the data by smoker and day columns.

The data is now indexed by the unique values in the smoker and day columns. Similarly, other aggregation operations such as sum(), median(), std() etc. can be applied to the groups within data.
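For example:

```python
# The result is indexed by the unique (smoker, day) combinations
tips.groupby(['smoker', 'day']).mean(numeric_only=True)
```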

Using aggregate()

The aggregate() method allows for even greater flexibility by taking a string, a function, or a list thereof and computing all the aggregates at once. The example below shows the minimum aggregation passed as a string, median passed as a function, and several aggregation operations passed as a list.

Aggregation functions can also be passed as a dictionary mapping column names to the operations to be applied to that column. The example below shows the min operation applied to the total_bill column and the max operation applied to the tip column.

A more complex operation could involve passing a list of operations to be applied to a specific column.
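A sketch of these variants, with the column choices as illustrative assumptions:

```python
import numpy as np

# A string, a function, and a list of operations
tips.groupby('sex')['total_bill'].aggregate('min')
tips.groupby('sex')['total_bill'].aggregate(np.median)
tips.groupby('sex')['total_bill'].aggregate(['min', 'median', 'max'])

# A dictionary maps column names to operations
tips.groupby('sex').aggregate({'total_bill': 'min', 'tip': 'max'})

# A list of operations applied to a single column
tips.groupby('sex').aggregate({'tip': ['min', 'median', 'max']})
```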

Selecting a Subset of Columns

For large datasets, it may be desirable to aggregate a specific column or only a subset of columns. As an example, we can group the data by smoker and compute mean for tip column as follows:

Similarly, we can group the data by the smoker and day columns and compute the median for the tip column.
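For example:

```python
# Aggregate a single column after grouping
tips.groupby('smoker')['tip'].mean()
tips.groupby(['smoker', 'day'])['tip'].median()
```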

Pivot Tables

Pivot Table is a popular operation that is commonly used on tabular data in spreadsheets. The pivot table takes simple column-wise data as input, and groups the entries into a two-dimensional table that provides a multidimensional summarization of the data. Pivot Tables are essentially a multidimensional version of GroupBy. Pandas includes a pandas.pivot_table function and DataFrame also has a pivot_table method.

The Seaborn library comes preloaded with some sample datasets. We will load the titanic dataset from Seaborn for our analysis and look at some examples.

Let's say we want to look at the average survival rate by both sex and class. We can get the results using both GroupBy and pivot_table.

We can see that the pivot_table approach is much more readable than the GroupBy and produces the same result. The default aggregation operation is mean.
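A sketch of the two approaches, assuming seaborn is imported as in the earlier snippet:

```python
titanic = sns.load_dataset('titanic')

# GroupBy: group, select, aggregate, then reshape
titanic.groupby(['sex', 'class'])['survived'].mean().unstack()

# pivot_table: the same result, more readably (mean is the default aggfunc)
titanic.pivot_table('survived', index='sex', columns='class')
```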

Multi-level Pivot Table

Just as in GroupBy, the data can be grouped at multiple levels using pivot_table. Suppose we want to group survival by sex and age. Since age is a continuous variable, we can create bins for age using pd.cut function and then group the data.

The operation can be applied to columns in a similar fashion. Suppose we want to group survival by sex and age and look at the data by class and fare.

We can discretize the fare variable into equal-sized buckets based on sample quantiles using pd.qcut and then group the data.
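A sketch of both steps; the age bin edges and the number of fare quantiles below are illustrative assumptions:

```python
# Bin the continuous 'age' column and use it as a second row level
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', index=['sex', age], columns='class')

# Quantile-based fare bins add a second column level
fare = pd.qcut(titanic['fare'], 2)
titanic.pivot_table('survived', index=['sex', age], columns=[fare, 'class'])
```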

Using aggfunc

The aggfunc keyword can be used to specify the aggregation functions applied to different columns.

To compute totals along each grouping, the margins keyword can be used.
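A sketch of both options on the titanic data:

```python
# Different aggregations per column via a dictionary
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': 'sum', 'fare': 'mean'})

# margins=True adds an 'All' row and column with the overall aggregates
titanic.pivot_table('survived', index='sex', columns='class', margins=True)
```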

describe() method

Convenience methods, such as describe(), can be used to compute several common aggregates for each column. It also comes in handy when you are trying to understand the overall properties of a dataset.

describe() on GroupBy

describe() can be used on a groupby() object to get common aggregates for a subset of data.
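For example, using the tips data (the grouping and column choices are illustrative):

```python
# Common aggregates for every numeric column
tips.describe()

# The same summary, computed per group
tips.groupby('smoker')['tip'].describe()
```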

Combining Data

Data science workflows often involve combining data from different sources to enhance the analysis. There are multiple ways in which data can be combined ranging from the straightforward concatenation of two different datasets, to more complicated database-style joins.

concat()

concat() can be used to stack data frames together along an axis.

By default, concat() works row-wise within the DataFrame (along axis=0).

The ignore_index flag can be used to discard the original index when it is not needed.

You can specify the axis along which the concatenation should take place. If the axis parameter is not specified, the concatenation works row-wise, generating NaN values for unmatched columns.

When axis='columns' is specified, the concatenation works along columns.
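A minimal sketch using two small, hypothetical DataFrames:

```python
df1 = pd.DataFrame({'A': ['A0', 'A1'], 'B': ['B0', 'B1']})
df2 = pd.DataFrame({'A': ['A2', 'A3'], 'B': ['B2', 'B3']})

# Row-wise by default (axis=0); the original index labels are kept
pd.concat([df1, df2])

# ignore_index=True builds a fresh RangeIndex instead
pd.concat([df1, df2], ignore_index=True)

# Column-wise concatenation
pd.concat([df1, df2], axis='columns')
```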

append()

append() works similarly to concat() but does not modify the original object; it creates a new object with the combined data. The method works row-wise within the DataFrame (along axis=0). It is not very efficient, as it involves the creation of a new index and data buffer. Note that append() was deprecated in pandas 1.4 and removed in pandas 2.0, so concat() is the preferred approach.
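A sketch reusing the hypothetical df1 and df2 from above:

```python
# Pre-2.0 pandas only:
# df1.append(df2)

# Equivalent, recommended approach on current versions
pd.concat([df1, df2])
```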

merge()

merge() joins rows in DataFrame based on one or more keys. It works as the entry point for all standard database join operations. Let's create sample data and look at some examples.

Notice that a column to join the data on was not specified. merge() uses the overlapping column names as keys for joining the data. It is good practice to explicitly specify the join column using the on keyword. The 'Marketing' and 'Operations' values and their associated data are missing from the result, as the operation returns only the common set.
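The sample data below is hypothetical, constructed only to be consistent with the join results described in this section:

```python
df_left = pd.DataFrame({
    'department': ['Accounting', 'Engineering', 'Engineering', 'Marketing'],
    'employee':   ['Amy', 'Bob', 'Cara', 'Dan']})
df_right = pd.DataFrame({
    'department': ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'Operations'],
    'supervisor': ['Eve', 'Frank', 'Gina', 'Hank', 'Ivan']})

# merge() picks up the overlapping 'department' column automatically,
# but naming the key explicitly is clearer
pd.merge(df_left, df_right, on='department')
```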

If the column names are different in the DataFrame, then left_on and right_on keywords can be used.

The redundant column can be dropped as needed using the drop() method.
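A sketch of this case, with a hypothetical rename of the key column on the right:

```python
df_right2 = df_right.rename(columns={'department': 'dept'})
merged = pd.merge(df_left, df_right2, left_on='department', right_on='dept')

# Remove the redundant key column
merged.drop(columns='dept')
```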

Inner Join

By default, merge() performs an inner join. The result is the intersection, or the common set, of keys found in both DataFrames. The merge operations we just saw were all inner joins. Different join types, such as left, right, and outer, can be specified using the how parameter.

'Marketing' and 'Operations' values and associated data are missing from the result, as the operation returns only the common set.
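```python
# how='inner' is the default; only keys present in both DataFrames survive
pd.merge(df_left, df_right, on='department', how='inner')
```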

Left Join

This join returns all records from the left DataFrame and the matched records from the right DataFrame.

The result is a Cartesian product of the matching rows. Since there were two 'Engineering' rows in the left DataFrame and two in the right, there are four 'Engineering' rows in the result. Similarly, there was one 'Accounting' row in the left DataFrame and two in the right, resulting in two 'Accounting' rows. The row for 'Operations' is missing, as this join only keeps matched rows from the right DataFrame.
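```python
# All rows from the left DataFrame, matched rows from the right
pd.merge(df_left, df_right, on='department', how='left')
```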

Right Join

This join returns all records from the right DataFrame and the matched records from the left DataFrame.

The result is a Cartesian product of the matching rows. Since there were two 'Engineering' rows in the right DataFrame and two in the left, there are four 'Engineering' rows in the result. Similarly, there was one 'Accounting' row in the left DataFrame and two in the right, resulting in two 'Accounting' rows. The row for 'Marketing' is missing, as this join only keeps matched rows from the left DataFrame.
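```python
# All rows from the right DataFrame, matched rows from the left
pd.merge(df_left, df_right, on='department', how='right')
```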

Outer Join

The outer join takes the union of the keys, combining the effect of applying both left and right joins.

The result is a Cartesian product of the rows using all key combinations, filling in missing values with NaN.
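```python
# Union of the keys; unmatched entries are filled with NaN
pd.merge(df_left, df_right, on='department', how='outer')
```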

Using index to merge

The index can also be used as the key for merging by specifying the left_index and/or right_index flags.
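A sketch, moving the hypothetical key column into the index on both sides:

```python
left_indexed = df_left.set_index('department')
right_indexed = df_right.set_index('department')
pd.merge(left_indexed, right_indexed, left_index=True, right_index=True)
```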

join()

The join() method of a DataFrame can also be used for merging by index. The how keyword can be specified to select the type of join.

'Marketing' and 'Operations' values and associated data are missing from the result, as the operation returns only the common set.

The result shows all the data, filling in missing values with NaN.
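A sketch reusing the index-keyed DataFrames from the previous example:

```python
# join() merges on the index by default; how= selects the join type
left_indexed.join(right_indexed, how='inner')   # common keys only
left_indexed.join(right_indexed, how='outer')   # all keys, NaN where unmatched
```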

Hierarchical Indexing

Hierarchical indexing (also known as multi-indexing) allows you to have multiple (two or more) index levels within a single index on an axis. It provides a way for representing higher dimensional data in a lower dimensional form. Let's start with a simple example, creating a series with multi-index.

Multi-indexed Series

With this indexing, you can easily index or slice the series using the index. However, what if you wanted to select all the values for 2015? The tuple-based index is essentially a multi-index and Pandas MultiIndex type allows us to create multi-level indexes. This provides us with the flexibility to perform operations on indexes easily and efficiently.

We will create a multi-indexed index for pop_series using the MultiIndex type.

Notice that the new_index object contains multiple levels of indexing, the state names and the years. We can now reindex the pop_series to see hierarchical representation of the data.
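A sketch of the whole construction; the state names, years, and population values are hypothetical, chosen only for illustration:

```python
# Populations keyed by (state, year) tuples
index = [('California', 2010), ('California', 2015),
         ('New York', 2010), ('New York', 2015),
         ('Texas', 2010), ('Texas', 2015)]
populations = [37253956, 39144818, 19378102, 19745289, 25145561, 27469114]
pop_series = pd.Series(populations, index=index)

# Promote the tuples to a true MultiIndex and reindex
new_index = pd.MultiIndex.from_tuples(index, names=['state', 'year'])
pop_series = pop_series.reindex(new_index)
pop_series
```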

Subset Selection

Subset selection of multi-indexed data is similar to what we have seen in the previous part of this guide series. Let's take a quick look.

We can now easily access the data by the second index level and answer our question about selecting all the values for 2015.
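```python
# All values for one state (outer level)
pop_series['California']

# A single value
pop_series['California', 2015]

# All values for 2015, across states (partial indexing on the inner level)
pop_series[:, 2015]
```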

unstack() and stack()

unstack() method will convert a multi-indexed Series into a DataFrame, and naturally stack() would do the opposite.

The result is a DataFrame where the second-level index (years) is converted to columns, and the first-level index (states) remains as the index of the DataFrame.

The level parameter can be passed to unstack() to unstack by a specific index level. Specifying level=0 will unstack based on the outermost index level, i.e. by 'state'.

Specifying level=1 unstacks by the inner index, in this case 'year'.
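```python
pop_df = pop_series.unstack()   # years become columns, states stay as the index
pop_df.stack()                  # back to the multi-indexed Series

pop_series.unstack(level=0)     # unstack the outer level: states become columns
pop_series.unstack(level=1)     # unstack the inner level: years become columns
```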

Multi-indexed DataFrame

In a DataFrame, both rows and columns can have multiple levels of indices. Let's create some sample data and take a look.

sales_data is essentially four dimensional data with 'sales person', 'product', 'year' and 'quarter' as its dimensions.
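A minimal sketch of such data; the names and values are hypothetical, and placing (year, quarter) on the rows and (sales person, product) on the columns is an assumption for illustration:

```python
import numpy as np

row_index = pd.MultiIndex.from_product([[2020, 2021], ['Q1', 'Q2']],
                                        names=['year', 'quarter'])
col_index = pd.MultiIndex.from_product([['Alice', 'Bob'], ['Widgets', 'Gadgets']],
                                        names=['sales person', 'product'])
rng = np.random.default_rng(0)
sales_data = pd.DataFrame(rng.integers(50, 200, size=(4, 4)),
                          index=row_index, columns=col_index)
sales_data
```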

Subset Selection

Since columns in a DataFrame are individual Series, the syntax used for multi-indexed Series applies to the columns.

.loc and .iloc index operators can also be used.
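A sketch using the hypothetical sales_data from above:

```python
# Column selection works level by level, as with a multi-indexed Series
sales_data['Alice']                  # all of Alice's products
sales_data['Alice', 'Widgets']       # a single column

# .loc and .iloc work the same way on the rows (and columns)
sales_data.loc[2020]                               # both quarters of 2020
sales_data.loc[(2020, 'Q1'), ('Alice', 'Widgets')] # a single cell
sales_data.iloc[:2, :2]
```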

Sorting

By Index and Level

sort_index() can be used to sort the index or levels within your data. By default, the sorting operation is performed on the outermost index (level=0) and in ascending order.

Specifying level=1 sorts by the inner index.

Sorting can be applied on columns by specifying axis='columns'.
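```python
sales_data.sort_index()                 # by the outermost row level ('year')
sales_data.sort_index(level=1)          # by the inner row level ('quarter')
sales_data.sort_index(axis='columns')   # sort the column MultiIndex instead
```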

By Value

sort_values() can be used to sort the values in a DataFrame by one or more columns.

For multi-indexed data, the column label must be unique, so the value passed to the by parameter must be a tuple with elements corresponding to each level.

Multiple columns, or a combination of columns and index levels, can be specified by passing them as a list of tuples.
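A sketch with the hypothetical sales_data columns:

```python
# A column label is a tuple covering every column level
sales_data.sort_values(by=('Alice', 'Widgets'))

# Several columns can be passed as a list of tuples
sales_data.sort_values(by=[('Alice', 'Widgets'), ('Bob', 'Gadgets')])
```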

Data Aggregations

We have seen data aggregations in a previous section in this notebook. Various aggregation functions such as sum(), mean(), median() can be applied to multi-indexed data.

The level parameter controls the subset of the data to which the aggregation is applied. Let's look at some examples.
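In current pandas versions, level-wise aggregation is expressed through groupby(level=...) (the older df.sum(level=...) form has been removed); a sketch using the hypothetical sales_data:

```python
# Aggregate within a row level
sales_data.groupby(level='year').sum()       # totals per year, across quarters
sales_data.groupby(level='quarter').mean()   # averages per quarter, across years
```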

Conclusion

In this part of the guide series, we learned how to be more productive with Pandas. We started by working with categorical data, looking at unique values and value counts, one hot encoding, and binning continuous variables. We then covered data aggregation using groupby() and pivot_table(). Next, we discussed how data can be combined using the concat(), append(), merge(), and join() methods. Finally, in the Hierarchical Indexing section, you saw how data can be indexed at multiple levels, and we covered selection, sorting, and aggregation on multi-indexed Series and DataFrames.

In the next part of this guide series, we will explore the capabilities for working with Time Series data.
