...
Code Block |
---|
a = ["Jake", "John", "Eric"] b = ["John", "Jill"] print(set(a).difference(set(b))) |
Pandas DataFrame / CSV / Join / Merge
...
Create a Pandas DataFrame based on array
Code Block |
---|
dict = {"country": ["Brazil", "Russia", "India", "China", "South Africa"],
"capital": ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
"area": [8.516, 17.10, 3.286, 9.597, 1.221],
"population": [200.4, 143.5, 1252, 1357, 52.98] }
import pandas as pd
brics = pd.DataFrame(dict)
print(brics) |
Adding index to a Pandas DataFrame
Code Block |
---|
# Set the index for brics
brics.index = ["BR", "RU", "IN", "CH", "SA"]
# Print out brics with new index values
print(brics) |
Reading CSV by Pandas DataFrame
Code Block |
---|
# Import pandas as pd
import pandas as pd
# Import the cars.csv data: cars
cars = pd.read_csv('cars.csv')
# Print out cars
print(cars) |
Reading a CSV file by Pandas DataFrame with 1st column as index
Code Block |
---|
# Import pandas and cars.csv
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Print out country column as Pandas Series
print(cars['cars_per_cap'])
# Print out country column as Pandas DataFrame
print(cars[['cars_per_cap']])
# Print out DataFrame with country and drives_right columns
print(cars[['cars_per_cap', 'country']]) |
Save a Pandas DataFrame in CSV format
Code Block |
---|
dict = {"country": ["Brazil", "Russia", "India", "China", "South Africa"],
"capital": ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
"area": [8.516, 17.10, 3.286, 9.597, 1.221],
"population": [200.4, 143.5, 1252, 1357, 52.98] }
import pandas as pd
brics = pd.DataFrame(dict)
brics.to_csv('example.csv') |
Save a Pandas DataFrame in CSV format with header and no index
Code Block |
---|
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000]
}
df = DataFrame(Cars, columns= ['Brand', 'Price'])
export_csv = df.to_csv (r'C:\Users\Ron\Desktop\export_dataframe.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path
print (df) |
Print partial rows (observations) from a Pandas DataFrame
Code Block |
---|
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Print out first 4 observations
print(cars[0:4])
# Print out fifth, sixth, and seventh observation
print(cars[4:6]) |
Data access by loc and iloc in Pandas DataFrame - Select columns by index or name
loc is label-based, and iloc is integer index based
Code Block |
---|
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Print out observation for Japan
print(cars.iloc[2])
# Print out observations for Australia and Egypt
print(cars.loc[['AUS', 'EG']]) |
Sort a Pandas DataFrame in an ascending order
Info |
---|
df.sort_values(by=['Brand'], inplace=True) |
Generator
Random number generation
Code Block |
---|
import random
def lottery():
# returns 6 numbers between 1 and 40
for i in range(6):
yield random.randint(1, 40)
# returns a 7th number between 1 and 15
yield random.randint(1,15)
for random_number in lottery():
print("And the next number is... %d!" %(random_number)) |
Swap variables' value
Code Block |
---|
a = 1
b = 2
a, b = b, a
print(a,b) |
Fibonacci series generator
The first two numbers of the series is always equal to 1, and each consecutive number returned is the sum of the last two numbers - the below code uses only two variables to get the result.
Code Block |
---|
def fib():
a, b = 1, 1
while 1:
yield a
a, b = b, a + b
# testing code
import types
if type(fib()) == types.GeneratorType:
print("Good, The fib function is a generator.")
counter = 0
for n in fib():
print(n)
counter += 1
if counter == 10:
break |
Function Arguments(Parameters)
Multiple Function Argument recognition - the list of "therest" parameters
Code Block |
---|
def foo(first, second, third, *therest):
print("First: %s" %(first))
print("Second: %s" %(second))
print("Third: %s" %(third))
print("And all the rest... %s" %(list(therest)))
foo(1,2,3,4,5) |
Multiple Function Argument by keyword
Code Block |
---|
def bar(first, second, third, **options):
if options.get("action") == "sum":
print("The sum is: %d" %(first + second + third))
if options.get("number") == "first":
return first
result = bar(1, 2, 3, action = "sum", number = "first")
print("Result: %d" %(result)) |
Regular Expression
RegEx(Regular Expressions) to search "[on]" or "[off]" on the string
Code Block |
---|
import re
pattern = re.compile(r"\[(on|off)\]") # Slight optimization
print(re.search(pattern, "Mono: Playback 65 [75%] [-16.50dB] [on]")) |
RegEx(Regular Expression) to check email address
Code Block |
---|
import re
def test_email(your_pattern):
pattern = re.compile(your_pattern)
emails = ["john@example.com", "python-list@python.org", "wha.t.`1an?ug{}ly@email.com"]
for email in emails:
if not re.match(pattern, email):
print("You failed to match %s" % (email))
elif not your_pattern:
print("Forgot to enter a pattern!")
else:
print("Pass")
pattern = r"[a-z0-9]+@[a-z0-9]+\.[a-z0-9]+"
test_email(pattern) |
Code Block |
# sort - ascending order
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000],
'Year': [2015,2013,2018,2018]
}
df = DataFrame(Cars, columns= ['Brand', 'Price','Year'])
# sort Brand - ascending order
df.sort_values(by=['Brand'], inplace=True)
print (df) |
Sort a Pandas DataFrame in a descending order
Info |
---|
df.sort_values(by=['Brand'], inplace=True, ascending=False) |
Code Block |
---|
# sort - descending order
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000],
'Year': [2015,2013,2018,2018]
}
df = DataFrame(Cars, columns= ['Brand', 'Price','Year'])
# sort Brand - descending order
df.sort_values(by=['Brand'], inplace=True, ascending=False)
print (df) |
Sort a Pandas DataFrame by multiple columns
Info |
---|
df.sort_values(by=['First Column','Second Column',...], inplace=True) |
print("Pass")
pattern = r"[a-z0-9]+@[a-z0-9]+\.[a-z0-9]+"
test_email(pattern) |
Exception Handling
try/except block
Code Block |
---|
def do_stuff_with_number(n):
print(n)
def catch_this():
the_list = (1, 2, 3, 4, 5)
for i in range(20):
try:
do_stuff_with_number(the_list[i])
except IndexError: # Raised when accessing a non-existing index of a list
do_stuff_with_number('out of bound - %d' % i)
catch_this() |
Code Block |
# sort by multiple columns
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000],
'Year': [2015,2013,2018,2018]
}
df = DataFrame(Cars, columns= ['Brand', 'Price','Year'])
# sort by multiple columns: Year and Price
df.sort_values(by=['Year','Price'], inplace=True)
print (df) |
Join and merge Pandas DataFrames
do_stuff_with_number(the_list[i])
except IndexError: # Raised when accessing a non-existing index of a list
do_stuff_with_number('out of bound - %d' % i)
catch_this()
|
Pandas DataFrame / CSV / Join / Merge
Create a Pandas DataFrame based on array
Code Block |
---|
dict = {"country": ["Brazil", "Russia", "India", "China", "South Africa" |
Code Block |
import pandas as pd
from IPython.display import display
from IPython.display import Image
raw_data = {
'subject_id': ['1', '2', '3', '4', '5'],
'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
raw_data = {
'subject_id': ['4', '5', '6', '7', '8'],
'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
raw_data = {
'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
# Join the two dataframes along rows
df_new = pd.concat([df_a, df_b])
# Join the two dataframes along columns
pd.concat([df_a, df_b], axis=1)
# Merge two dataframes along the subject_id value
pd.merge(df_new, df_n, on='subject_id')
# Merge two dataframes with both the left and right dataframes using the subject_id key
pd.merge(df_new, df_n, left_on='subject_id', right_on='subject_id')
# Merge with outer join
pd.merge(df_a, df_b, on='subject_id', how='outer')
# Merge with inner join
pd.merge(df_a, df_b, on='subject_id', how='inner')
# Merge with right join
pd.merge(df_a, df_b, on='subject_id', how='right')
# Merge with left join
pd.merge(df_a, df_b, on='subject_id', how='left')
# Merge while adding a suffix to duplicate column names
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right'))
# Merge based on indexes
pd.merge(df_a, df_b, right_index=True, left_index=True) |
Get the maximum value of column in Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
# Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]
}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# get the maximum values of all the column in dataframe - it will be raghu, 26, 89, object
df.max()
# get the maximum value of the column 'Age' - it will be 26
df['Age'].max()
# get the maximum value of the column 'Name' - it will be raghu
df['Name'].max() |
Get the minimum value of column in Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
# Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]
}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# get the minimum values of all the column in dataframe - it will display Alex, 22, 31, object
df.min()
# get the minimum value of the column 'Age' - it will be 22
df['Age'].min()
# get the minimum value of the column 'Name' - it will be Alex
df['Name'].min() |
Select row with maximum and minimum value in Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# get the row of max value
df.loc[df['Score'].idxmax()]
# get the row of minimum value
df.loc[df['Score'].idxmin()] |
Get the unique values (rows) of a Pandas Dataframe
Code Block |
---|
Create Dataframe:
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24]
}
df = pd.DataFrame(d,columns=['Name','Age'])
# get the unique values (rows)
print df.drop_duplicates()
# get the unique values (rows) by retaining last row
print df.drop_duplicates(keep='last') |
Get the list of column headers or column name in a Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# method 1: get list of column name
list(df.columns.values)
# method 2: get list of column name
list(df) |
Delete or Drop the duplicate row of a Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# drop duplicate rows
df.drop_duplicates()
# drop duplicate rows by retaining last occurrence
df.drop_duplicates(keep='last')
# drop duplicate by a column name
df.drop_duplicates(['Name'], keep='last') |
Adding index to a Pandas DataFrame
Code Block |
---|
# Set the index for brics
brics.index = ["BR", "RU", "IN", "CH", "SA"]
# Print out brics with new index values
print(brics) |
Reading CSV by Pandas DataFrame
Code Block |
---|
# Import pandas as pd
import pandas as pd
# Import the cars.csv data: cars
cars = pd.read_csv('cars.csv')
# Print out cars
print(cars) |
Reading a CSV file by Pandas DataFrame with 1st column as index
Code Block |
---|
# Import pandas and cars.csv
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Print out country column as Pandas Series
print(cars['cars_per_cap'])
# Print out country column as Pandas DataFrame
print(cars[['cars_per_cap']])
# Print out DataFrame with country and drives_right columns
print(cars[['cars_per_cap', 'country']]) |
Save a Pandas DataFrame in CSV format
Code Block |
---|
dict = {"country": ["Brazil", "Russia", "India", "China", "South Africa"],
"capital": ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
"area": [8.516, 17.10, 3.286, 9.597, 1.221],
"population": [200.4, 143.5, 1252, 1357, 52.98] }
import pandas as pd
brics = pd.DataFrame(dict)
brics.to_csv('example.csv') |
Save a Pandas DataFrame in CSV format with header and no index
Code Block |
---|
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000]
}
df = DataFrame(Cars, columns= ['Brand', 'Price'])
export_csv = df.to_csv (r'C:\Users\Ron\Desktop\export_dataframe.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path
print (df) |
Print partial rows (observations) from a Pandas DataFrame
Code Block |
---|
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Print out first 4 observations
print(cars[0:4])
# Print out fifth, sixth, and seventh observation
print(cars[4:6]) |
Data access by loc and iloc in Pandas DataFrame - Select columns by index or name
loc is label-based, and iloc is integer index based
Code Block |
---|
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Print out observation for Japan
print(cars.iloc[2])
# Print out observations for Australia and Egypt
print(cars.loc[['AUS', 'EG']]) |
Sort a Pandas DataFrame in an ascending order
Info |
---|
df.sort_values(by=['Brand'], inplace=True) |
Code Block |
---|
# sort - ascending order
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000],
'Year': [2015,2013,2018,2018]
}
df = DataFrame(Cars, columns= ['Brand', 'Price','Year'])
# sort Brand - ascending order
df.sort_values(by=['Brand'], inplace=True)
print (df) |
Sort a Pandas DataFrame in a descending order
Info |
---|
df.sort_values(by=['Brand'], inplace=True, ascending=False) |
Code Block |
---|
# sort - descending order
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000],
'Year': [2015,2013,2018,2018]
}
df = DataFrame(Cars, columns= ['Brand', 'Price','Year'])
# sort Brand - descending order
df.sort_values(by=['Brand'], inplace=True, ascending=False)
print (df) |
Sort a Pandas DataFrame by multiple columns
Info |
---|
df.sort_values(by=['First Column','Second Column',...], inplace=True) |
Code Block |
---|
# sort by multiple columns
from pandas import DataFrame
Cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [22000,25000,27000,35000],
'Year': [2015,2013,2018,2018]
}
df = DataFrame(Cars, columns= ['Brand', 'Price','Year'])
# sort by multiple columns: Year and Price
df.sort_values(by=['Year','Price'], inplace=True)
print (df) |
Join and merge Pandas DataFrames
Code Block |
---|
import pandas as pd
from IPython.display import display
from IPython.display import Image
raw_data = {
'subject_id': ['1', '2', '3', '4', '5'],
'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
raw_data = {
'subject_id': ['4', '5', '6', '7', '8'],
'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
raw_data = {
'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
# Join the two dataframes along rows
df_new = pd.concat([df_a, df_b])
# Join the two dataframes along columns
pd.concat([df_a, df_b], axis=1)
# Merge two dataframes along the subject_id value
pd.merge(df_new, df_n, on='subject_id')
# Merge two dataframes with both the left and right dataframes using the subject_id key
pd.merge(df_new, df_n, left_on='subject_id', right_on='subject_id')
# Merge with outer join
pd.merge(df_a, df_b, on='subject_id', how='outer')
# Merge with inner join
pd.merge(df_a, df_b, on='subject_id', how='inner')
# Merge with right join
pd.merge(df_a, df_b, on='subject_id', how='right')
# Merge with left join
pd.merge(df_a, df_b, on='subject_id', how='left')
# Merge while adding a suffix to duplicate column names
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right'))
# Merge based on indexes
pd.merge(df_a, df_b, right_index=True, left_index=True) |
Get the maximum value of column in Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
# Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]
}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# get the maximum values of all the column in dataframe - it will be raghu, 26, 89, object
df.max()
# get the maximum value of the column 'Age' - it will be 26
df['Age'].max()
# get the maximum value of the column 'Name' - it will be raghu
df['Name'].max() |
Get the minimum value of column in Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
# Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]
}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# get the minimum values of all the column in dataframe - it will display Alex, 22, 31, object
df.min()
# get the minimum value of the column 'Age' - it will be 22
df['Age'].min()
# get the minimum value of the column 'Name' - it will be Alex
df['Name'].min() |
Select row with maximum and minimum value in Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# get the row of max value
df.loc[df['Score'].idxmax()]
# get the row of minimum value
df.loc[df['Score'].idxmin()] |
Reshape long to wide in Pandas DataFrame with pivot function
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'countries':['A','B','C','A','B','C'],
'metrics':['population_in_million','population_in_million','population_in_million',
'gdp_percapita','gdp_percapita','gdp_percapita'],
'values':[100,200,120,2000,7000,15000]
}
df = pd.DataFrame(d,columns=['countries','metrics','values'])
# reshape from long to wide in pandas python
df2=df.pivot(index='countries', columns='metrics', values='values') |
Get the unique values (rows) of a Pandas Dataframe
Code Block |
---|
Create Dataframe:
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24]
}
df = pd.DataFrame(d,columns=['Name','Age'])
# get the unique values (rows)
print df.drop_duplicates()
# get the unique values (rows) by retaining last row
print df.drop_duplicates(keep='last') |
Reshape using Stack() and unstack() function in Pandas DataFrame
Get the list of column headers or column name in a Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# method 1: get list of column name
list(df.columns.values)
# method 2: get list of column name
list(df) |
Code Block |
import pandas as pd
import numpy as np
header = pd.MultiIndex.from_product([['Semester1','Semester2'],['Maths','Science']])
d=([[12,45,67,56],[78,89,45,67],[45,67,89,90],[67,44,56,55]])
df = pd.DataFrame(d,
index=['Alisa','Bobby','Cathrine','Jack'],
columns=header)
# stack the dataframe
stacked_df=df.stack()
# unstack the dataframe
unstacked_df = stacked_df.unstack()
# stack the dataframe of column at level 0
stacked_df_lvl=df.stack(level=0)
# unstack the dataframe
unstacked_df1 = stacked_df_lvl.unstack() |
Generator
Random number generation
list of column name
list(df) |
Delete or Drop the duplicate row of a Pandas DataFrame
Code Block |
---|
import random
def lottery():
# returns 6 numbers between 1 and 40
for i in range(6):
yield random.randint(1, 40)
# returns a 7th number between 1 and 15
yield random.randint(1,15)
for random_number in lottery():
print("And the next number is... %d!" %(random_number)) |
Swap variables' value
Code Block |
---|
a = 1
b = 2
a, b = b, a
print(a,b) |
Fibonacci series generator
The first two numbers of the series is always equal to 1, and each consecutive number returned is the sum of the last two numbers - the below code uses only two variables to get the result.
Code Block |
---|
def fib():
a, b = 1, 1
while 1:
yield a
a, b = b, a + b
# testing code
import types
if type(fib()) == types.GeneratorType:
print("Good, The fib function is a generator.")
counter = 0
for n in fib():
print(n)
counter += 1
if counter == 10:
break |
Function Arguments(Parameters)
Multiple Function Argument recognition - the list of "therest" parameters
Code Block |
---|
def foo(first, second, third, *therest):
print("First: %s" %(first))
print("Second: %s" %(second))
print("Third: %s" %(third))
print("And all the rest... %s" %(list(therest)))
foo(1,2,3,4,5) |
Multiple Function Argument by keyword
Code Block |
---|
def bar(first, second, third, **options):
if options.get("action") == "sum":
print("The sum is: %d" %(first + second + third))
if options.get("number") == "first":
return first
result = bar(1, 2, 3, action = "sum", number = "first")
print("Result: %d" %(result)) |
Regular Expression
RegEx(Regular Expressions) to search "[on]" or "[off]" on the string
Code Block |
---|
import re
pattern = re.compile(r"\[(on|off)\]") # Slight optimization
print(re.search(pattern, "Mono: Playback 65 [75%] [-16.50dB] [on]")) |
RegEx(Regular Expression) to check email address
Code Block |
---|
import re
def test_email(your_pattern):
pattern = re.compile(your_pattern)
emails = ["john@example.com", "python-list@python.org", "wha.t.`1an?ug{}ly@email.com"]
for email in emails:
if not re.match(pattern, email):
print("You failed to match %s" % (email))
elif not your_pattern:
print("Forgot to enter a pattern!")
else:
print("Pass")
pattern = r"[a-z0-9]+@[a-z0-9]+\.[a-z0-9]+"
test_email(pattern) |
Exception Handling
try/except block
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# drop duplicate rows
df.drop_duplicates()
# drop duplicate rows by retaining last occurrence
df.drop_duplicates(keep='last')
# drop duplicate by a column name
df.drop_duplicates(['Name'], keep='last') |
Drop or delete the row in Pandas DataFrame with conditions
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'Name':['Alisa','Bobby','jodha','jack','raghu','Cathrine',
'Alisa','Bobby','kumar','Alisa','Alex','Cathrine'],
'Age':[26,24,23,22,23,24,26,24,22,23,24,24],
'Score':[85,63,55,74,31,77,85,63,42,62,89,77]}
df = pd.DataFrame(d,columns=['Name','Age','Score'])
# Drop an observation or row
df.drop([1,2])
# Drop a row by condition
df[df.Name != 'Alisa']
# Drop a row by index
df.drop(df.index[2])
# Drop bottom 3 rows
df[:-3] |
Reshape wide to long in Pandas DataFrame with melt() function
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'countries':['A','B','C'],
'population_in_million':[100,200,120],
'gdp_percapita':[2000,7000,15000]
}
df = pd.DataFrame(d,columns=['countries','population_in_million','gdp_percapita'])
# shape from wide to long with melt function in pandas
df2=pd.melt(df,id_vars=['countries'],var_name='metrics', value_name='values') |
Reshape long to wide in Pandas DataFrame with pivot function
Code Block |
---|
import pandas as pd
import numpy as np
#Create a DataFrame
d = {
'countries':['A','B','C','A','B','C'],
'metrics':['population_in_million','population_in_million','population_in_million',
'gdp_percapita','gdp_percapita','gdp_percapita'],
'values':[100,200,120,2000,7000,15000]
}
df = pd.DataFrame(d,columns=['countries','metrics','values'])
# reshape from long to wide in pandas python
df2=df.pivot(index='countries', columns='metrics', values='values')
|
Reshape using Stack() and unstack() function in Pandas DataFrame
Code Block |
---|
import pandas as pd
import numpy as np
header = pd.MultiIndex.from_product([['Semester1','Semester2'],['Maths','Science']])
d=([[12,45,67,56],[78,89,45,67],[45,67,89,90],[67,44,56,55]])
df = pd.DataFrame(d,
index=['Alisa','Bobby','Cathrine','Jack'],
columns=header)
# stack the dataframe
stacked_df=df.stack()
# unstack the dataframe
unstacked_df = stacked_df.unstack()
# stack the dataframe of column at level 0
stacked_df_lvl=df.stack(level=0)
# unstack the dataframe
unstacked_df1 = stacked_df_lvl.unstack() |
Code Block |
def do_stuff_with_number(n):
print(n)
def catch_this():
the_list = (1, 2, 3, 4, 5)
for i in range(20):
try:
do_stuff_with_number(the_list[i])
except IndexError: # Raised when accessing a non-existing index of a list
do_stuff_with_number('out of bound - %d' % i)
catch_this()
|