import pandas as pd
from collections import OrderedDict
# CSV file holding every stop record.
stops_all = 'stops_all.csv'


def zapper(x):
    """Return str(x) lower-cased with every non-alphanumeric character removed."""
    kept = [ch.lower() for ch in str(x) if ch.isalnum()]
    return ''.join(kept)
# Load every stop record as text; missing cells become empty strings.
stopsDF = pd.read_csv(stops_all, dtype=str).fillna('')
len(stopsDF)

# 'zap' holds the output of zapper() for each stop name.
stopsDF['zap'] = stopsDF['stop_name'].map(zapper)
stopsDF[['stop_name', 'zap']].sample(10)

# One row per distinct zapped name, sorted, with a fresh 0..n-1 index.
uniqueNameDF = (
    stopsDF[['zap']]
    .drop_duplicates()
    .sort_values('zap')
    .copy()
    .reset_index(drop=True)
)
print(len(uniqueNameDF))
uniqueNameDF.tail(10)
# Give every row of uniqueNameDF a three-letter code: row 0 -> 'AAA',
# row 1 -> 'AAB', ... row 17575 -> 'ZZZ'.
# chr()/ord() stepping from https://www.geeksforgeeks.org/ways-increment-character-python/
print("Characters range: A:{} to Z:{}".format(ord('A'),ord('Z')))
print(chr(65))

base = ord('A')
letters = ord('Z') - base + 1   # 26 letters per position
max_codes = letters ** 3        # number of distinct three-letter codes

# Fail loudly instead of leaving rows without a code: an uncoded row would
# carry NaN into the later ''.join() over the 'code' column.
if len(uniqueNameDF) > max_codes:
    raise ValueError(
        "{} unique stop names but only {} three-letter codes are available".format(
            len(uniqueNameDF), max_codes))

# Row position n is written as a three-digit base-26 number, most significant
# letter first, so the last letter cycles fastest.
uniqueNameDF['code'] = [
    "{}{}{}".format(
        chr(base + n // (letters * letters)),
        chr(base + (n // letters) % letters),
        chr(base + n % letters))
    for n in range(len(uniqueNameDF))
]
uniqueNameDF.sample(10)
# Attach each stop's code by left-joining on the zapped name, so every row of
# stopsDF is kept.
# left join in pandas: http://www.datasciencemadesimple.com/join-merge-data-frames-pandas-python/
stopsDF2 = stopsDF.merge(uniqueNameDF, how='left', on='zap')
stopsDF2[
    ['folder', 'jsonFile', 'direction_id', 'stop_name', 'zap', 'code']
].sample(10)
# Collapse along: folder + jsonFile + direction_id
def stringEmUp(x):
    """Collapse one group of rows into its concatenated code pattern.

    x is a DataFrame with a 'code' column of strings.

    Returns a Series with two entries, in this order:
      'pattern' - the values of 'code' joined in row order, no separator
      'len'     - the number of rows in x
    """
    codes = x['code'].tolist()
    # Built from ordered pairs so the key order does not depend on dict
    # literal ordering.
    return pd.Series(OrderedDict([
        ('pattern', ''.join(codes)),
        ('len', len(x)),
    ]))
# One row per (folder, jsonFile, direction_id) group, carrying the pattern
# and length returned by stringEmUp.
routeDF1 = (
    stopsDF2[['folder', 'jsonFile', 'direction_id', 'code']]
    .groupby(['folder', 'jsonFile', 'direction_id'])
    .apply(stringEmUp)
)
routeDF1.head(10)

# Turn the group keys back into ordinary columns.
routeDF2 = routeDF1.reset_index()
routeDF2.head(10)

# Each code is 3 characters wide, so pattern length / 3 is the number of codes.
print(len(routeDF2.loc[0, 'pattern']) / 3)
print(len(routeDF2.loc[6, 'pattern']) / 3)
# confirmed

# Write both tables out, with the index saved under the column name 'sr'.
routeDF2.to_csv('route_patterns.csv', index_label='sr')
stopsDF2.to_csv('stops_all_coded.csv', index_label='sr')
'#' * 70
# Pick one collapsed route (row position 45) to check against the source data.
trip = routeDF2.iloc[45]
trip
int(len(trip['pattern']) / 3)

# Code -> zapped-name lookup table, indexed by code.
uniqueNameDF2 = uniqueNameDF.set_index('code', drop=True)
uniqueNameDF2.sample(10)

pattern = trip['pattern']
# Decode the pattern back into zapped stop names, one 3-character code at a
# time. The loop is driven by the pattern's own length so it works for a trip
# with any number of stops.
for startpos in range(0, len(pattern), 3):
    endpos = startpos + 3
    code = pattern[startpos:endpos]
    print(code, uniqueNameDF2.at[code,'zap'])
# Now fetch the same trip's rows from the coded stops data: match on all three
# key columns, then keep only the columns needed for the comparison.
stopsDF2.loc[
    (stopsDF2['folder'] == trip['folder'])
    & (stopsDF2['jsonFile'] == trip['jsonFile'])
    & (stopsDF2['direction_id'] == trip['direction_id']),
    ['folder', 'jsonFile', 'direction_id', 'stop_name', 'code', 'zap'],
]
# The zapped names decoded from the pattern match the original route's 'zap' column.