A “cheat sheet” of commonly used Python data tools, control structures and their syntax.
Full pandas documentation.
Python 2.7 documentation.
# single line comment
''' multi line comments '''
import geopandas as gpd
from geopandas.tools import sjoin
from geopandas.tools import overlay
import pandas as pd
from pandas.stats.api import ols   # removed in newer pandas; see the OLS regression entry below
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import StringIO   # Python 2 only; in Python 3 use: from io import StringIO
import os
import sys
Note that on Anaconda, the typical path for packages is something like:
/opt/conda/envs/myenv/lib/python3.6/site-packages
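A quick way to confirm which interpreter and which installed copy of a package a session is actually using (a minimal sketch, assuming pandas is installed):

```python
import sys
import pandas

print(sys.executable)      # which interpreter is running
print(pandas.__version__)  # installed pandas version
print(pandas.__file__)     # path the package was imported from
```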
read and write: csv, excel, shapefile, etc
filename = communityName + '\\output\\df_nonRes_bldgs.csv'
df_general = pd.read_csv(filename)
specify a single worksheet to read
df_tripsOD = pd.read_excel(filename, sheet_name='HBS_24')
specify multiple worksheets to read
l_purps = ['HBS-24','HBW-24','HBO-24','NHB-24']
d_df_tripsOD = pd.read_excel(filename, sheet_name=l_purps)
df_nonRes_bldgs.to_csv(filename, index=False)
df_structure.to_excel(filename)
df_parcels_0 = gpd.GeoDataFrame.from_file('myParcels.shp')
df_structure = gpd.read_file('myLayers.gdb', layer='structure')
Round trip export/import for JSON files, as Python dictionaries or lists. Requires import json.
filename = 'folderX/list_CEUDSector_ind.json'
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(l_CEUDSector_ind, f, ensure_ascii=False, indent=4)
filename = 'folderX/list_CEUDSector_ind.json'
with open(filename) as data_file:
    l_CEUDSector_ind = json.load(data_file)
df_buildings.info()
The null_counts=True option forces the non-null counts to be shown (renamed show_counts in newer pandas).
To see the individual cell types
df_bld_0.DFRPPropertyNumber.map(type)
df_buildings.describe()
df_buildings.head()
df_buildings.tail()
df_buildings.sample(10)
df_buildings.StrucType.value_counts()
pt_structure = pd.pivot_table(df_structure, values='OBJECTID', index=['propType'], columns=['strucType'], margins=False, aggfunc=np.sum)
ref
Common aggfunc options: len, np.sum, np.mean (see the sketch below).
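A minimal sketch of those aggfunc options on made-up data:

```python
import numpy as np
import pandas as pd

# Toy data, invented for illustration
df_toy = pd.DataFrame({'propType':  ['res', 'res', 'com', 'com', 'com'],
                       'strucType': ['low', 'high', 'low', 'low', 'high'],
                       'OBJECTID':  [1, 2, 3, 4, 5]})

# len counts records; np.sum / np.mean aggregate the values column
print(pd.pivot_table(df_toy, values='OBJECTID', index='propType',
                     columns='strucType', aggfunc=len))
print(pd.pivot_table(df_toy, values='OBJECTID', index='propType',
                     columns='strucType', aggfunc=np.mean))
```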
df_buildings.StrucType.unique()
df_buildings.StrucType.nunique()
Selecting rows
df_farm = df_structure[ (df_structure.PropertyCode >= 200) & (df_structure.PropertyCode < 300)]
Can use bitwise operators: AND &, OR |, NOT ~
Other boolean operators: ==, >=, >, <=, <, !=, isin(myList), isnull(), notnull() (see the sketch below)
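A small sketch combining a few of these operators on made-up data:

```python
import pandas as pd

# Invented example frame
df_toy = pd.DataFrame({'PropertyCode': [110, 210, 250, 310, None],
                       'name': ['a', 'b', 'c', 'd', 'e']})

# keep rows whose code is in a list AND is not null
mask = df_toy.PropertyCode.isin([210, 250]) & df_toy.PropertyCode.notnull()
print(df_toy[mask])

# NOT ~ inverts a condition
print(df_toy[~df_toy.PropertyCode.isin([210, 250])])
```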
Selecting columns
df_condosLUT = df_condos.loc[:,['RollNumber','CondoRollN']]
df_buildings.loc[df_buildings.UNIQUEID == '6554354', 'ASMTCODE'] = '303 - Warehouses (Including Cold Storage Plants)'
fillna() ref
df_building['YearBuilt'].fillna(2011, inplace=True)
Conditional if-then-else column-wise logic
df_structure['finalID'] = np.where(df_structure.condoID.isnull(), df_structure.normalID, df_structure.condoID)
where() and mask() replace values where the condition is False and True, respectively (see the where() sketch below).
df_8.mask(df_8 > 5.0, 5.0)
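For contrast with mask() above, the same cap expressed with where(), on a made-up stand-in for df_8 (a minimal sketch):

```python
import pandas as pd

# Invented stand-in for df_8
df_8 = pd.DataFrame({'x': [1.0, 4.0, 7.0], 'y': [6.0, 2.0, 9.0]})

capped_a = df_8.mask(df_8 > 5.0, 5.0)    # replace where the condition IS True
capped_b = df_8.where(df_8 <= 5.0, 5.0)  # replace where the condition is NOT True

print(capped_a.equals(capped_b))  # True -- both cap values at 5.0
```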
Turns columns A and B into the multiindex for the output dataframe.
df.set_index(['A','B'])
Remove multiindexing.
df.reset_index()
Remove an unwanted column level (keep only the level named 'CEUDEndUse_res')
df.columns = df.columns.get_level_values('CEUDEndUse_res')
drop() column
df_parcels.drop(['index_right'], axis=1, inplace=True)
drop_duplicates()
s_parcelARNs_noDups = df_parcels_2a.PID_str.drop_duplicates(keep='first')
keep options: 'first', 'last', False (see the sketch below)
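A minimal sketch of the three keep options on a made-up series:

```python
import pandas as pd

s = pd.Series(['111', '222', '222', '333'])

print(s.drop_duplicates(keep='first'))  # keeps the first '222'
print(s.drop_duplicates(keep='last'))   # keeps the last '222'
print(s.drop_duplicates(keep=False))    # drops every duplicated value entirely
```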
df_buildings.boxplot(column=['area'], by=['CATEGORY'], figsize=(12,12))
df_look['bsmtArea_sf'].plot(kind='hist', bins=40, title='bsmt. area', xlim=[0, max_footprint])
plt.show()
df_aptByParcel.plot.scatter(x='PhysDwPerBldg', y='numStoreys_avg', sharex=False)
df_aptByParcel.plot.hexbin(x='PhysDwPerBldg', y='numStoreys_avg', sharex=False, gridsize=25)
histograms, maps, KDEs, boxplot, etc
from pivottablejs import pivot_ui

url = 'http://servername.xyz.com:8889/files/city_share/'
filename = 'pivottable/df_nonRes.html'
pivot_ui(df_CAPEX_1,
         rows=['buildingID'], rowOrder='value_z_to_a',
         cols=['year'],
         vals=['capexInt_dlrPerSqft'],
         exclusions={'buildingID': ['']},
         aggregatorName='Sum', rendererName='Heatmap',
         outfile_path=filename,
         url=url + filename)
documentation: parameters and python/jupyter
import plotly.express as px
import plotly.io as pio

fig = px.scatter(pt_peo_adjFactors_bid_3, x='electricity', y='naturalGas',
                 hover_name=pt_peo_adjFactors_bid_3.index,
                 marginal_x='histogram', marginal_y='histogram')
fig.show()

filename = "pivottable/baseline_adjFactors_electricity-naturalGas.html"
pio.write_html(fig, file=filename)
overlay, centroid, area, crs transform, etc
sjoin()
df_parcels_1c = sjoin(df_parcels_1b, df_trafficZones_2, how="left")
The axis parameter specifies the axis along which a function is computed; by default axis=0. This is consistent with numpy.mean when axis is given explicitly (numpy.mean defaults to axis=None, which averages over the flattened array): axis=0 runs along the rows (the index in pandas) and axis=1 runs along the columns. A small sum() illustration follows the diagram below.
             A          B
   0    0.626386    1.52325     ----axis=1---->
   |
 axis=0
   |
   v
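A quick illustration of the two axes with sum() on a tiny made-up frame:

```python
import pandas as pd

# Made-up frame matching the sketch above
df = pd.DataFrame({'A': [0.626386, 1.0], 'B': [1.52325, 2.0]})

print(df.sum(axis=0))  # one value per column (collapse the rows/index)
print(df.sum(axis=1))  # one value per row (collapse the columns)
```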
groupby() ref
gb_single = df_single.groupby(['ID_TAZ', 'buildingClass', 'yearbuiltClass'], as_index=False)
df_singleUnits = gb_single.NosUnits.sum()
Other groupby convenience functions: size(), mean(), first(), last(), nth() (see the sketch below)
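A minimal sketch of a few of these on made-up data:

```python
import pandas as pd

# Invented example data
df_toy = pd.DataFrame({'buildingClass': ['A', 'A', 'B', 'B', 'B'],
                       'NosUnits': [2, 4, 1, 3, 5]})
gb = df_toy.groupby('buildingClass')

print(gb.size())            # record count per group
print(gb.NosUnits.mean())   # mean per group
print(gb.NosUnits.first())  # first record in each group
print(gb.NosUnits.nth(1))   # nth record (0-based) in each group
```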
OLS regression. Note that pandas.stats.api has been removed from newer pandas releases; a statsmodels sketch follows the original snippet.
from pandas.stats.api import ols
regr_1 = ols(y=df_apt['footprint_avg_sqft'], x=df_apt['PhysDwPerBldg'], intercept=False)
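Since pandas.stats.api.ols is gone from current pandas, a roughly equivalent fit can be done with statsmodels. This is a minimal sketch; the df_apt values below are invented for illustration:

```python
import pandas as pd
import statsmodels.api as sm

# Made-up stand-in for df_apt used above
df_apt = pd.DataFrame({'PhysDwPerBldg': [10, 20, 30, 40],
                       'footprint_avg_sqft': [5000, 9800, 15200, 19900]})

# No constant term, mirroring intercept=False in the removed pandas ols
regr_1 = sm.OLS(df_apt['footprint_avg_sqft'], df_apt[['PhysDwPerBldg']]).fit()
print(regr_1.params)
print(regr_1.rsquared)
```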
String concatenation (both columns must already be strings; see astype() below for casting first)
df_structure['propType'] = df_structure.PropertyCode + ' - ' + df_structure.propCodeDesc
String length
df_parcels.ROLLNO.str.len().value_counts()
String slice
df_parcels_2b['ARN'] = df_parcels_2b.ARN.str.slice(0,15)
where first and second indices are inclusive and exclusive respectively.
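A quick illustration of the slice bounds (the value is made up):

```python
import pandas as pd

s = pd.Series(['1234567890123456789'])
print(s.str.slice(0, 15))  # keeps characters 0..14 (15 characters)
```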
String (and number) formatting example
x = df_pop2016_0.value_total.sum()
print("{:,}".format(x))
formats the number with commas (e.g. 29,232,957.0). See resource; a few more format specifiers are sketched below.
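A few related format specifiers (a minimal sketch; the value is invented):

```python
x = 29232957.0

print("{:,}".format(x))     # 29,232,957.0
print("{:,.0f}".format(x))  # 29,232,957 -- no decimal places
print(f"{x:,.2f}")          # 29,232,957.00 -- f-string form (Python 3.6+)
```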
traversal
for var in traversable_object:
    stmt
    ...
for a dictionary
for k, v in d_test.items():
    print(k)
    print(v)
or
for k in d_test:
    print(k)
    print(d_test[k])
dictionaries
handy dictionary methods: get(), keys(), values() (see the sketch below)
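A minimal sketch of those methods:

```python
d_test = {'a': 1, 'b': 2}

print(d_test.get('a'))        # 1
print(d_test.get('z', 0))     # 0 -- default returned when the key is missing
print(list(d_test.keys()))    # ['a', 'b']
print(list(d_test.values()))  # [1, 2]
```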
lists
handy list methods: append()
function
def function_name(params):
    stmt
    ...
    [return var]
Function used in DataFrame calculations with apply() example:
def getFootprintSizeClass(row):
    if (row['dwellType'] == 'single'):
        if (row['footprintClass'] == '(0, 500]'):
            return 'tiny'
        elif ((row['footprintClass'] == '(500, 750]') |
              (row['footprintClass'] == '(750, 1000]')):
            return 'small'
        elif (row['footprintClass'] == '(1000, 1500]'):
            return 'medium'
    elif (row['dwellType'] == 'double'):
        if ((row['footprintClass'] == '(0, 500]') |
            (row['footprintClass'] == '(500, 750]')):
            return 'small'
        elif ((row['footprintClass'] == '(750, 1000]') |
              (row['footprintClass'] == '(1000, 1500]') |
              (row['footprintClass'] == '(1500, 10000]') |
              (row['footprintClass'] == '(10000, 100000]')):
            return 'large'
    else:
        return None

df_building['footprintSizeClass'] = df_building.apply(getFootprintSizeClass, axis=1)
increment integer
int_i += 1
Prior to commit, strip ipynb notebook files with linux shell as per this blog post
nbstrip_jq myNotebook.ipynb > myNotebook_stripped.ipynb
And then
git status
git add <filename>
git add --all
git commit -m "description"
git push [origin master]
Get the local reference history (every commit HEAD has pointed to)
git reflog
Pull
git pull origin master
shares / norm
each row sums to 1
df_shr = df_quantity.div(df_quantity.sum(axis='columns'), axis='index')
each column sums to 1
df_shr = df_quantity.div(df_quantity.sum(axis='index'), axis='columns')
Links
[Github](http://github.com)
Embed an image
![title](img/S_res_1.png)
astype() - cast a pandas Series from one dtype to another.
df_propCodes['propType'] = df_propCodes['propCode'].astype(str) \
    + ' - ' + df_propCodes_1.propCodeDesc
types: str, int, float
also see: pd.to_numeric()
df['elec'] = pd.to_numeric(df['elec'], errors='coerce')
cut() ref
footprintBins = [0, 500, 750, 1000, 1500, 10000, 100000]
df_building['footprintClass'] = pd.cut(df_building.footprintPerUnit, footprintBins)
more cut examples
concat() - combine blocks of records into a single DataFrame. Default axis is 0 (vertical); rows are added. See article on sort parameter
df_double = pd.concat([ df_semi, df_duplex], sort=True, axis=0)
melt() ref
df_emissions_2 = pd.melt(df_emissions_1,
                         id_vars=['department', 'scope', 'fuelType'],
                         value_vars=['2016', '2017', '2018'],
                         var_name='Time', value_name='Values')
Note that value_vars is optional; if omitted, all columns not listed in id_vars are used.
merge() ref
df_building_2 = pd.merge(df_building_1, df_parcel, left_on='ParcelID', right_on='PID', how='left')
Note that merge returns a plain DataFrame, dropping the GeoDataFrame attributes, so they need to be reconstructed:
df_building_2 = gpd.GeoDataFrame(pd.merge(df_building_1, df_parcel,
                                          left_on='ParcelID', right_on='PID', how='left'))
df_building_2.crs = df_building_1.crs
map() ref
d_storeysClass = {0: '1Story', 1: '1Story', 1.5: '2Story', 2: '2Story', 3: '3Story'}
df_building['storeysClass'] = df_building.numStoreys.map(d_storeysClass)
modulo
x % y
gives the remainder after integer division.
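For example:

```python
print(17 % 5)  # 2
print(10 % 2)  # 0 -- handy for even/odd checks
```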
rename()
df_structure.rename(columns={ 'oldName1' : 'newName1', 'oldName2' : 'newName2'}, inplace=True)
replace() - string replace
df_bldgEnergy_6b['buildingID'] = (df_bldgEnergy_6b.Site_Address
    .str.replace(r'\s', '_', regex=True)   # whitespace
    .str.replace('.', '', regex=False)
    .str.replace(',', '_', regex=False)
    .str.replace('&', 'and', regex=False)
    .str.replace('[()]', '', regex=True)   # brackets
    .str.replace('/', '_', regex=False))
reset_index() - turn multi-index columns back into regular columns
df.reset_index()
s_strucCounts.sort_index()
s_strucCounts.sort_values()
df_a.sort_values('fieldX', ascending=False)
set() ref
Useful for list differences
set([1,2,3]) - set([2,3,4])
See other set operations including:
set1.union(set2)
set1.intersection(set2)
%time (once) and %timeit (multiple iterations) magic functions for execution runtime.
%time df_footprints = sjoin(df_footprints, df_parcels, how="left")
%timeit df_footprints = sjoin(df_footprints, df_parcels, how="left")
zfill - pads string with leading 0s to reach specified string length
df_building_1a['PID'] = df_building_1a['PID'].str.zfill(8)