Python/Pandas Quick Reference Card

A “cheat sheet” of commonly used Python data tools, control structures and their syntax.

Full pandas documentation.

Python 2.7 documentation.

Comments

# single line comment

'''
multi
line
comments
'''

Common library imports

import geopandas as gpd
from geopandas.tools import sjoin
from geopandas.tools import overlay
import pandas as pd
from pandas.stats.api import ols  # removed in pandas 0.20+; see Statistical models below
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import StringIO  # Python 2; in Python 3 use: from io import StringIO
import os
import sys

Note that on Anaconda, the typical path for packages is something like:

/opt/conda/envs/myenv/lib/python3.6/site-packages
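To check where an imported package actually lives (a quick sanity check, not specific to Anaconda):

print(pd.__file__)   # filesystem path of the imported pandas package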

Data import and export

read and write: csv, excel, shapefile, etc FIXME

filename = communityName + '\\output\\df_nonRes_bldgs.csv'
df_general = pd.read_csv(filename)

specify a single worksheet to read

df_tripsOD = pd.read_excel(filename, sheet_name='HBS_24')

specify multiple worksheets to read

l_purps = ['HBS-24','HBW-24','HBO-24','NHB-24']
d_df_tripsOD = pd.read_excel(filename, sheet_name=l_purps)
df_nonRes_bldgs.to_csv(filename, index=False)
df_structure.to_excel(filename)
df_parcels_0 = gpd.GeoDataFrame.from_file('myParcels.shp')
df_structure = gpd.read_file('myLayers.gdb', layer='structure')

Round trip export/import for json files, as Python dictionaries or lists. Requires import json.

filename = 'folderX/list_CEUDSector_ind.json'
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(l_CEUDSector_ind, f, ensure_ascii=False, indent=4)
filename = 'folderX/list_CEUDSector_ind.json'
with open(filename) as data_file:
    l_CEUDSector_ind = json.load(data_file)

Data interrogation

df_buildings.info()

Use the null_counts=True option to force null counts on large frames.

To see the individual cell types

df_bld_0.DFRPPropertyNumber.map(type)
df_buildings.describe()
df_buildings.head()
df_buildings.tail()
df_buildings.sample(10)
df_buildings.StrucType.value_counts()
pt_structure = pd.pivot_table(df_structure,
    values='OBJECTID',
    index=['propType'],
    columns=['strucType'],
    margins=False,
    aggfunc=np.sum)

Common aggfunc params: len, np.sum, np.mean (ref)

df_buildings.StrucType.unique()
df_buildings.StrucType.nunique()

Selecting data

Selecting rows

df_farm = df_structure[
    (df_structure.PropertyCode >= 200) &
    (df_structure.PropertyCode < 300)]

Can use bitwise operators: AND &, OR |, NOT ~

Other boolean operators: ==, !=, >, >=, <, <=, isin(myList), isnull(), notnull()
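A hedged example combining several of these (column names are illustrative):

df_sel = df_structure[
    df_structure.StrucType.isin(['house', 'duplex']) &
    df_structure.YearBuilt.notnull() &
    ~(df_structure.PropertyCode == 100)]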

Selecting columns

df_condosLUT = df_condos.loc[:,['RollNumber','CondoRollN']]

Setting and filling data

df_buildings.loc[df_buildings.UNIQUEID == '6554354',
    'ASMTCODE'] = '303 - Warehouses (Including Cold Storage Plants)'

fillna() ref

df_building['YearBuilt'].fillna(2011, inplace=True)

Conditional if-then-else column-wise logic

df_structure['finalID'] = np.where(df_structure.condoID.isnull(),
    df_structure.normalID, df_structure.condoID)

where() and mask() replace values where the condition is False and True, respectively.

df_8.mask(df_8 > 5.0, 5.0)
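The where() counterpart keeps values where the condition holds and replaces the rest:

df_8.where(df_8 > 5.0, 5.0)   # values <= 5.0 become 5.0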

Multiindexing

Turns columns A and B into the multiindex for the output dataframe.

df.set_index(['A','B'])

Remove multiindexing.

df.reset_index()

Remove an unwanted level

df.columns = df.columns.get_level_values('CEUDEndUse_res')
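To select rows from a multiindexed frame, xs() is handy; a minimal sketch, reusing the ('A','B') index from above:

df_mi = df.set_index(['A','B'])
df_mi.xs('someA', level='A')   # all rows whose A level equals 'someA'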

Dropping/deleting data

drop() column

df_parcels.drop(['index_right'], axis=1, inplace=True)

drop_duplicates()

s_parcelARNs_noDups = df_parcels_2a.PID_str.drop_duplicates(keep='first')

keep options: first, last, False

Plotting

df_buildings.boxplot(column=['area'],
                       by=['CATEGORY'], figsize=(12,12))
df_look['bsmtArea_sf'].plot(kind='hist', bins=40, title='bsmt. area',
        xlim=[0,max_footprint])
plt.show()
df_aptByParcel.plot.scatter(x='PhysDwPerBldg',
    y='numStoreys_avg', sharex=False)
df_aptByParcel.plot.hexbin(x='PhysDwPerBldg',
    y='numStoreys_avg', sharex=False, gridsize=25)

histograms, maps, KDEs, boxplot, etc FIXME
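A couple of hedged sketches toward the FIXME items (assumes df_look as above, and df_parcels as a GeoDataFrame with an 'area' column):

df_look['bsmtArea_sf'].plot(kind='kde', title='bsmt. area KDE')
plt.show()
df_parcels.plot(column='area', legend=True, figsize=(12,12))   # simple choropleth map
plt.show()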

Pivottable.js

from pivottablejs import pivot_ui

url = 'http://servername.xyz.com:8889/files/city_share/'
filename = 'pivottable/df_nonRes.html'

pivot_ui(df_CAPEX_1,
        rows=['buildingID'],
        rowOrder='value_z_to_a',
        cols=['year'],
        vals=['capexInt_dlrPerSqft'],
        exclusions={'buildingID':['']},
        aggregatorName='Sum',
        rendererName='Heatmap',
        outfile_path=filename, url=url + filename)

documentation: parameters and python/jupyter

Plotly

import plotly.express as px
import plotly.io as pio

fig = px.scatter(pt_peo_adjFactors_bid_3,
                 x='electricity',
                 y='naturalGas',
                 hover_name=pt_peo_adjFactors_bid_3.index,
                 marginal_x='histogram',
                 marginal_y='histogram')
fig.show()

filename = "pivottable/baseline_adjFactors_electricity-naturalGas.html"
pio.write_html(fig, file=filename)

Geo-processing data

overlay, centroid, area, crs transform, etc FIXME

sjoin()

df_parcels_1c = sjoin(df_parcels_1b, df_trafficZones_2, how="left")
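Hedged sketches for the remaining FIXME items above (the frames and the EPSG code are illustrative):

df_parcels['centroid_geom'] = df_parcels.geometry.centroid   # per-feature centroids
df_parcels['area_calc'] = df_parcels.geometry.area           # area in CRS units
df_parcels_utm = df_parcels.to_crs(epsg=26917)               # reproject to a new CRS
df_clipped = gpd.overlay(df_parcels, df_zones, how='intersection')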

Axis

axis specifies the axis along which a function is computed; by default axis=0. This matches numpy.mean when axis is given explicitly (numpy.mean defaults to axis=None, which averages over the flattened array): axis=0 runs down the rows (the index in pandas), and axis=1 runs across the columns.

+------------+----------+---------+
|            |  A       |  B      |
+------------+----------+---------+
|      0     | 0.626386 | 1.52325 |  ----axis=1----->
+------------+----------+---------+
                  |          |
                  |  axis=0  |
                  v          v
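For example:

df.sum(axis=0)   # collapse down the rows: one sum per column
df.sum(axis=1)   # collapse across the columns: one sum per row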

Aggregating data

groupby() ref

gb_single = df_single.groupby(
    ['ID_TAZ','buildingClass', 'yearbuiltClass'], as_index=False)

df_singleUnits = gb_single.NosUnits.sum()

other groupby convenience functions: size(), mean(), first(), last(), nth()
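Several aggregates in one pass with agg(), a minimal sketch:

df_unitStats = gb_single.NosUnits.agg(['sum', 'mean', 'size'])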

Statistical models

OLS regression

from pandas.stats.api import ols

regr_1 = ols(y=df_apt['footprint_avg_sqft'], x=df_apt['PhysDwPerBldg'],
    intercept=False)
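pandas.stats.api was removed in pandas 0.20; a roughly equivalent sketch using statsmodels (same columns assumed):

import statsmodels.api as sm

# no constant term added, matching intercept=False above
model = sm.OLS(df_apt['footprint_avg_sqft'], df_apt['PhysDwPerBldg']).fit()
print(model.summary())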

Strings

String concatenation

df_structure['propType'] = df_structure.PropertyCode + ' - ' + df_structure.propCodeDesc

String length

df_parcels.ROLLNO.str.len().value_counts()

String slice

df_parcels_2b['ARN'] = df_parcels_2b.ARN.str.slice(0,15)

where first and second indices are inclusive and exclusive respectively.

String (and number) formatting example

x = df_pop2016_0.value_total.sum()
print("{:,}".format(x))

formats the number with commas (e.g. 29,232,957.0). See resource
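The same result with an f-string (Python 3.6+):

print(f"{x:,}")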

Python data and control structures

traversal

for var in traversable_object:
    stmt ...

for a dictionary

for k,v in d_test.items():
    print(k)
    print(v)

or

for k in d_test:
    print(k)
    print(d_test[k])

Handy dictionary methods: get(), keys(), values()

Handy list methods: append()
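Minimal sketches of the methods listed above (names are illustrative):

v = d_test.get('someKey', 'fallback')   # lookup with a default instead of a KeyError
l_test = []
l_test.append('newItem')                # add an item in place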

function

def function_name(params):
    stmt ...
    [return var]

Function used in DataFrame calculations with apply() example:

def getFootprintSizeClass(row):
    if row['dwellType'] == 'single':
        if row['footprintClass'] == '(0, 500]':
            return 'tiny'
        elif row['footprintClass'] in ('(500, 750]', '(750, 1000]'):
            return 'small'
        elif row['footprintClass'] == '(1000, 1500]':
            return 'medium'
    elif row['dwellType'] == 'double':
        if row['footprintClass'] in ('(0, 500]', '(500, 750]'):
            return 'small'
        elif row['footprintClass'] in ('(750, 1000]', '(1000, 1500]',
                                       '(1500, 10000]', '(10000, 100000]'):
            return 'large'
    return None

df_building['footprintSizeClass'] = df_building.apply(getFootprintSizeClass, axis=1)

Other useful Python syntax

increment integer

int_i += 1

Github

Prior to commit, strip ipynb notebook files with linux shell as per this blog post

nbstrip_jq myNotebook.ipynb > myNotebook_stripped.ipynb

And then

git status

git add <filename>
git add --all

git commit -m "description"
git push [origin master]

Get commit history

git reflog

Pull

git pull origin master

Handy Matrix & Vector Operations

shares / norm

each row sums to 1

df_shr = df_quantity.div(df_quantity.sum(axis='columns'), axis='index')

each column sums to 1

df_shr = df_quantity.div(df_quantity.sum(axis='index'), axis='columns')
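As a check, these should each return a vector of ones:

df_shr.sum(axis='columns')   # after the row normalization
df_shr.sum(axis='index')     # after the column normalization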

Jupyter Notebook Markdown Tricks

Links

[Github](http://github.com)

Embed an image

![title](img/S_res_1.png)

Python Debugger (PDB)

ref

import pdb; pdb.set_trace()

n for next line; c to continue; s to step into a call; q to quit

Other functions

A

astype() - cast pandas Series from one dtype type to another.

df_propCodes['propType'] = df_propCodes['propCode'].astype(str) \
    + ' - ' + df_propCodes.propCodeDesc

types: str, int, float

also see: pd.to_numeric()

df['elec'] = pd.to_numeric(df['elec'], errors='coerce')

C

cut() ref

footprintBins = [0, 500, 750, 1000, 1500, 10000, 100000]

df_building['footprintClass'] = pd.cut(
    df_building.footprintPerUnit, footprintBins)
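cut() can also label the bins directly; a hedged sketch (label names are illustrative and must match the number of intervals):

df_building['footprintSize'] = pd.cut(
    df_building.footprintPerUnit, footprintBins,
    labels=['xs', 's', 'm', 'l', 'xl', 'xxl'])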

more cut examples

concat() - combine blocks of records into a single DataFrame. Default axis is 0 (vertical); rows are added. See article on sort parameter

df_double = pd.concat([
    df_semi,
    df_duplex],
    sort=True,
    axis=0)

M

melt() ref

df_emissions_2 = pd.melt(df_emissions_1,
                         id_vars=['department','scope','fuelType'],
                         value_vars=['2016','2017','2018'],
                         var_name='Time',
                         value_name='Values')

Note that value_vars is optional; if omitted, all columns not set as id_vars are used.

merge() ref

df_building_2 = pd.merge(df_building_1,
    df_parcel, left_on='ParcelID', right_on='PID', how='left')

Note that pd.merge returns a plain DataFrame, losing the GeoDataFrame attributes, so they need to be reconstructed:

df_building_2 = gpd.GeoDataFrame(pd.merge(df_building_1,
    df_parcel, left_on='ParcelID', right_on='PID', how='left'))
df_building_2.crs = df_building_1.crs

map() ref

d_storeysClass = {
    0 : '1Story',
    1 : '1Story',
    1.5 : '2Story',
    2 : '2Story',
    3 : '3Story'
}
    
df_building['storeysClass'] = df_building.numStoreys.map(d_storeysClass)

modulo

x % y

gives the remainder after integer division.
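For example:

10 % 3          # 1
divmod(10, 3)   # (3, 1): quotient and remainder together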

R

rename()

df_structure.rename(columns={
    'oldName1' : 'newName1',
    'oldName2' : 'newName2'},
    inplace=True)

replace() - string replace

df_bldgEnergy_6b['buildingID'] = (df_bldgEnergy_6b.Site_Address
    .str.replace(r'\s', '_', regex=True)    # whitespace -> underscore
    .str.replace('.', '', regex=False)
    .str.replace(',', '_', regex=False)
    .str.replace('&', 'and', regex=False)
    .str.replace(r'[()]', '', regex=True)   # strip brackets
    .str.replace('/', '_', regex=False))

reset_index() - turn index levels (including a multiindex) back into regular columns

df.reset_index()

S

s_strucCounts.sort_index()
s_strucCounts.sort_values()

df_a.sort_values('fieldX', ascending=False)

set() ref

Useful for list differences

set([1,2,3]) - set([2,3,4])

See other set operations including:

set1.union(set2)
set1.intersection(set2)
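Also:

set1.difference(set2)             # same as set1 - set2
set1.symmetric_difference(set2)   # in exactly one of the two sets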

T

%time (once) and %timeit (multiple iterations) magic functions for execution runtime.

%time df_footprints = sjoin(df_footprints, df_parcels, how="left")
%timeit df_footprints = sjoin(df_footprints, df_parcels, how="left")

Z

zfill - pads string with leading 0s to reach specified string length

df_building_1a['PID'] = df_building_1a['PID'].str.zfill(8)