Introduction to Pandas in Python

This is an introduction to the Pandas module in Python. I demonstrate core Pandas functionality using a tabular dataset from OICA.

# import pandas
import pandas

# read in oica.csv file with data
pandas.read_csv("oica.csv",sep=",")

	year	country	output
0	2018	Argentina	466649
1	2018	Austria	164900
2	2018	Belgium	308493
3	2018	Brazil	2879809
4	2018	Canada	2020840
…	…	…	…
835	1999	Ukraine	1918
836	1999	UK	1973519
837	1999	USA	13024978
838	1999	Uzbekistan	44433
839	1999	Others	11965

840 rows × 3 columns

# read in oica.csv and store to object
data_df = pandas.read_csv("oica.csv",sep=",")

# check the type of the data object to which read_csv stored data
type(data_df)

pandas.core.frame.DataFrame

# check the dimensions (shape) of the data frame
data_df.shape

(840, 3)

# check a info summary for the data frame object
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 3 columns):
year        840 non-null int64
country     840 non-null object
output      840 non-null int64
dtypes: int64(2), object(1)
memory usage: 19.8+ KB

# check the header of the data frame object
data_df.head()

	year	country	output
0	2018	Argentina	466649
1	2018	Austria	164900
2	2018	Belgium	308493
3	2018	Brazil	2879809
4	2018	Canada	2020840

# check the tail of the data frame object
data_df.tail()

	year	country	output
835	1999	Ukraine	1918
836	1999	UK	1973519
837	1999	USA	13024978
838	1999	Uzbekistan	44433
839	1999	Others	11965

# check column names
data_df.columns

Index(['year', 'country ', 'output '], dtype='object')

# check row index range
data_df.index

RangeIndex(start=0, stop=840, step=1)

# check the body of the data_df object (numpy representation of data frame);
# may be useful when transfering data to different frame works
data_df.values

array([[2018, 'Argentina', 466649],
       [2018, 'Austria', 164900],
       [2018, 'Belgium', 308493],
       ...,
       [1999, 'USA', 13024978],
       [1999, 'Uzbekistan', 44433],
       [1999, 'Others', 11965]], dtype=object)

# get datatype overview by column
data_df.dtypes

year         int64
country     object
output       int64
dtype: object

# access column, here "country" column (subsetting column)
country_col = data_df["country "]
country_col

0       Argentina
1         Austria
2         Belgium
3          Brazil
4          Canada
          ...    
835       Ukraine
836            UK
837           USA
838    Uzbekistan
839        Others
Name: country , Length: 840, dtype: object

# check type of column
type(country_col)

pandas.core.series.Series

# return data frame instead of series when subsetting from data_df
country_col = data_df[["country "]] # subsetting a list of values from the series
country_col

	country
0	Argentina
1	Austria
2	Belgium
3	Brazil
4	Canada
…	…
835	Ukraine
836	UK
837	USA
838	Uzbekistan
839	Others

840 rows × 1 columns

# check type of country_col now
type(country_col)

pandas.core.frame.DataFrame

# dropping column in data frame
data_df.drop(["country "],axis="columns")

	year	output
0	2018	466649
1	2018	164900
2	2018	308493
3	2018	2879809
4	2018	2020840
…	…	…
835	1999	1918
836	1999	1973519
837	1999	13024978
838	1999	44433
839	1999	11965

840 rows × 2 columns

# subsetting rows using .loc
data_df.loc[0]

year             2018
country     Argentina
output         466649
Name: 0, dtype: object

# subsetting specific cell, using .loc
data_df.loc[0][1]

'Argentina'

# subsetting list of rows, using .loc
data_df.loc[[0,1,3,6,9,10]]

	year	country	output
0	2018	Argentina	466649
1	2018	Austria	164900
3	2018	Brazil	2879809
6	2018	Colombia	72800
9	2018	Finland	112104
10	2018	France	2270000

# unlike python lists, -1 will not work with .loc
data_df.loc[-1]

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
    375             try:
--> 376                 return self._range.index(new_key)
    377             except ValueError:

ValueError: -1 is not in range

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-38-404dbc95dcdf> in <module>
      1 # unlike python lists, -1 will not work with .loc
----> 2 data_df.loc[-1]

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1422 
   1423             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1424             return self._getitem_axis(maybe_callable, axis=axis)
   1425 
   1426     def _is_scalar_access(self, key: Tuple):

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1848         # fall thru to straight lookup
   1849         self._validate_key(key, axis)
-> 1850         return self._get_label(key, axis=axis)
   1851 
   1852 

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
    158             raise IndexingError("no slices here, handle elsewhere")
    159 
--> 160         return self.obj._xs(label, axis=axis)
    161 
    162     def _get_loc(self, key: int, axis: int):

~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
   3735             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3736         else:
-> 3737             loc = self.index.get_loc(key)
   3738 
   3739             if isinstance(loc, np.ndarray):

~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
    376                 return self._range.index(new_key)
    377             except ValueError:
--> 378                 raise KeyError(key)
    379         return super().get_loc(key, method=method, tolerance=tolerance)
    380 

KeyError: -1

# but, using .iloc we can use -1
data_df.iloc[-1]

year          1999
country     Others
output       11965
Name: 839, dtype: object

# above return object is of type series
# below: subsetting such that data frame is returned
data_df.iloc[[-1]]

	year	country	output
839	1999	Others	11965

# subsetting data frame, accessing several rows using .iloc
data_df.iloc[[-1,0,1]]

	year	country	output
839	1999	Others	11965
0	2018	Argentina	466649
1	2018	Austria	164900

# using slicing notation ":" to access all rows, using .loc
data_df.loc[:,["year"]].head()

	year
0	2018
1	2018
2	2018
3	2018
4	2018

# the same as above, using .iloc
data_df.iloc[:,[0]].head()

	year
0	2018
1	2018
2	2018
3	2018
4	2018

# filtering when subsetting, using .loc
data_df.loc[data_df["country "] == "Japan"].head()

	year	country	output
17	2018	Japan	9728528
56	2017	Japan	9693746
97	2016	Japan	9204590
137	2015	Japan	9278238
177	2014	Japan	9774665

# subsetting and filtering, considering multiple variables and using .loc
data_df.loc[(data_df["country "] == "Japan") & (data_df["year"] == 2014)]

	year	country	output
177	2014	Japan	9774665

# using groupby for grouping data frame entries; 
# this will subset by that variable (separate data frame for each group)
data_df.groupby("country ")["output "].mean().head() # you could group by multiple variables; in that case use a list

country 
Argentina     429045.750000
Argentina     472158.000000
Australia     264759.250000
Austria       140122.571429
Belgium       711430.238095
Name: output , dtype: float64

# a more generic way, using numpy
import numpy
data_df.groupby("country ")["output "].agg(numpy.mean).tail()

country 
Turkey        8.993806e+05
UK            1.678415e+06
USA           1.097888e+07
Ukraine       8.167905e+04
Uzbekistan    6.586671e+04
Name: output , dtype: float64

# another example, passing on numpy.std
data_df.groupby("country ")["output "].agg(numpy.std).tail()

country 
Turkey        4.669412e+05
UK            1.943838e+05
USA           1.832615e+06
Ukraine       1.201598e+05
Uzbekistan    7.579244e+04
Name: output , dtype: float64

# using .reset_index for returning a regular pandas data frame object
data_df.groupby("country ")["output "].agg(numpy.std).reset_index().tail()

	country	output
38	Turkey	4.669412e+05
39	UK	1.943838e+05
40	USA	1.832615e+06
41	Ukraine	1.201598e+05
42	Uzbekistan	7.579244e+04

# pipe in python, using pandas - using parentheses "()"
(data_df
.groupby("country ")["output "]
.agg(numpy.std)
.reset_index()
.tail())

	country	output
38	Turkey	4.669412e+05
39	UK	1.943838e+05
40	USA	1.832615e+06
41	Ukraine	1.201598e+05
42	Uzbekistan	7.579244e+04

# pipe in python, using pandas - using backslash "\"
data_df\
.groupby("country ")["output "]\
.agg(numpy.std)\
.reset_index()\
.tail()

	country	output
38	Turkey	4.669412e+05
39	UK	1.943838e+05
40	USA	1.832615e+06
41	Ukraine	1.201598e+05
42	Uzbekistan	7.579244e+04

# using .str excessor
data_df["country "].str.capitalize().head()

0    Argentina
1      Austria
2      Belgium
3       Brazil
4       Canada
Name: country , dtype: object

# using the .str excessor from pandas data frame
data_df["country "].str.contains("A").head()

0     True
1     True
2    False
3    False
4    False
Name: country , dtype: bool

# assigning new column to existing data frame and populating it
data_df["test"] = 1
data_df.head()

	year	country	output	test
0	2018	Argentina	466649	1
1	2018	Austria	164900	1
2	2018	Belgium	308493	1
3	2018	Brazil	2879809	1
4	2018	Canada	2020840	1

# remove additional column
data_df = data_df[["year","country ","output "]]
data_df.head()

	year	country	output
0	2018	Argentina	466649
1	2018	Austria	164900
2	2018	Belgium	308493
3	2018	Brazil	2879809
4	2018	Canada	2020840

# using .pivot_table
pivot_df = data_df.pivot_table(index = ["country "],
                              columns = ["year"],
                              values = ["output "]).reset_index()
pivot_df.tail()

	country	output
year		1999	2000	2001	2002	2003	2004	2005	2006	2007	…	2009	2010	2011	2012	2013	2014	2015	2016	2017	2018
38	Turkey	297862.0	430947.0	270685.0	346565.0	346565.0	678381.0	879452.0	98778.0	1099413.0	…	869605.0	1094557.0	1189131.0	1072978.0	1125534.0	1170445.0	1358796.0	1485927.0	1695731.0	1550150.0
39	UK	1973519.0	1813894.0	1685238.0	1823018.0	1823018.0	1851484.0	1803109.0	1648388.0	1750253.0	…	1090139.0	1393463.0	1463999.0	1576945.0	1597872.0	1598879.0	1682156.0	1816622.0	1749385.0	1604328.0
40	USA	13024978.0	12799857.0	11424689.0	12279582.0	12279582.0	12052179.0	11946653.0	11263986.0	10780729.0	…	5709431.0	7743093.0	8661535.0	10335765.0	11066432.0	11660702.0	12100095.0	12198137.0	11189985.0	11314705.0
41	Ukraine	1918.0	31255.0	31824.0	53773.0	53773.0	14739.0	215759.0	29526.0	402591.0	…	69295.0	83133.0	104654.0	76281.0	50449.0	28751.0	8244.0	5264.0	9542.0	6623.0
42	Uzbekistan	44433.0	32273.0	41005.0	29554.0	29554.0	63601.5	95814.0	110.0	1849.0	…	1179.0	15688.0	17956.0	16418.0	246641.0	24566.0	1854.0	88152.0	140247.0	220667.0

5 rows × 21 columns

# creating a pandas data frame
data2_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : ["a","b","c"]
})
# show data2_df
data2_df

	col1	col2
0	1	a
1	2	b
2	3	c

# writing and defining a simple function called "simple_function"
def simple_function(x):
    return x+1
# applying "simple_function" to all entries in "col1"
data2_df["col1"] = data2_df["col1"].apply(simple_function)

# view data2_df now, after having used .apply method
data2_df

	col1	col2
0	2	a
1	3	b
2	4	c

# applying default function "print" to entire data2_df, column by column
data2_df.apply(print)

0    2
1    3
2    4
Name: col1, dtype: object
0    a
1    b
2    c
Name: col2, dtype: object

col1    None
col2    None
dtype: object

# using apply in combination with e.g. numpy.mean(); numpy is smart and will recognize the different columns
def simple_function2(col):
    return numpy.mean(col)
# define a thrid data frame using pandas
data3_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : [9,8,7]
})
# apply simple_function2 to data3_df
data3_df.apply(simple_function2).reset_index()

	index	0
0	col1	2.0
1	col2	8.0

# define data4_df, which contains same values as data3_df originally did
data4_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : [9,8,7]
})
# define a function simple_function3 that can process in different columns, without numpy
def simple_function3(col):
    x = col[0]
    y = col[1]
    z = col[2]
    return (x+y+z)/3
# apply to data3_df
data4_df.apply(simple_function3).reset_index()

	index	0
0	col1	2.0
1	col2	8.0

# simple functions such as mean can be directly accessed as a pandas data frame series method
data4_df["col1"].mean()

2.0

# basic mathematical operations can also be conducted e.g. column by column, i.e. column-wise
data4_df["sum"] = data4_df["col1"] + data4_df["col2"]
data4_df

	col1	col2	sum
0	1	9	10
1	2	8	10
2	3	7	10

# numpy has a function called vectorize; 
# numpy.vectorize() takes a function as input, and creates a vectorized version of that function
# lets define a functino that we want to use in vectorized form, based on a pandas data frame
def simple_function4(x,y):
    if (x == 20):
        return numpy.NaN 
    else:
        return(x+y) / 2
# create a new data frame with two columns; call it "data5_df"
data5_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : [9,8,7]
})
# vectorize simple_function4 using numpy.vectorize()
simple_function4VEC = numpy.vectorize(simple_function4)
# now, apply simple_function4 to the data frame data5_df
simple_function4VEC(data5_df["col1"],data5_df["col2"])

array([5., 5., 5.])

# instead of passing the function to numpy.vectorize you can use a decorator
@numpy.vectorize
def simple_function5(x,y):
    if(x == 20):
        return numpy.NaN
    else: 
        return(x+y)/2
# try again, using data5_df
simple_function5(data5_df["col1"],data5_df["col2"])

array([5., 5., 5.])

Linnart Felkl

Data scientist focusing on simulation, optimization and modeling in R, SQL, VBA and Python

Cookie	Duration	Description
cookielawinfo-checkbox-analytics	11 months	This cookie is set by GDPR Cookie Consent plugin. The cookie is used to store the user consent for the cookies in the category "Analytics".
cookielawinfo-checkbox-functional	11 months	The cookie is set by GDPR cookie consent to record the user consent for the cookies in the category "Functional".
cookielawinfo-checkbox-necessary	11 months	This cookie is set by GDPR Cookie Consent plugin. The cookies is used to store the user consent for the cookies in the category "Necessary".
cookielawinfo-checkbox-others	11 months	This cookie is set by GDPR Cookie Consent plugin. The cookie is used to store the user consent for the cookies in the category "Other.
cookielawinfo-checkbox-performance	11 months	This cookie is set by GDPR Cookie Consent plugin. The cookie is used to store the user consent for the cookies in the category "Performance".
viewed_cookie_policy	11 months	The cookie is set by the GDPR Cookie Consent plugin and is used to store whether or not user has consented to the use of cookies. It does not store any personal data.

Introduction to Pandas in Python

Leave a Reply

Leave a Reply Cancel reply

You May Also Like

Service facility allocation – an optimization

Warehouse receiving process simulation

Visualization of SimPy parking lot simulation results

Leave a Reply

Leave a Reply Cancel reply