In [1]:
from collections import OrderedDict
from itertools import islice, product

import pandas as pd
In [2]:
# Path of the stops CSV that this notebook reads with pd.read_csv.
stops_all = 'stops_all.csv'
In [3]:
def zapper(x):
    """Return the normalised ("zapped") form of ``x``.

    ``x`` is converted with ``str()``; every character that is not
    alphanumeric (as judged by ``str.isalnum``) is dropped and the remaining
    characters are lower-cased, e.g. ``"Pgad / ASL"`` -> ``"pgadasl"``.
    """
    # A named def rather than a lambda bound to a name (PEP 8): the helper
    # gets a docstring and appears as ``zapper`` in tracebacks.
    return ''.join(ch.lower() for ch in str(x) if ch.isalnum())
In [4]:
# Read every column as text, then replace missing cells with empty strings.
stopsDF = pd.read_csv(stops_all, dtype=str)
stopsDF = stopsDF.fillna('')
# Row count of the loaded table.
stopsDF.shape[0]
Out[4]:
73969

Make a new column "zap" containing the zapped stop_name (lower-cased, with all non-alphanumeric characters removed).

In [5]:
# Store the zapped form of every stop name in a new 'zap' column.
stopsDF['zap'] = stopsDF['stop_name'].map(zapper)
# Show ten random rows to compare the original and zapped names.
stopsDF.loc[:, ['stop_name', 'zap']].sample(n=10)
Out[5]:
stop_name zap
67596 Vengal Rao Building / ACB Office vengalraobuildingacboffice
14540 Gajula Ramaram X Road gajularamaramxroad
45274 Bandlaguda X Road bandlagudaxroad
47330 Punjagutta Colony punjaguttacolony
58965 Shalibanda shalibanda
4030 Ganesh Temple ganeshtemple
65511 Prashanth Nagar prashanthnagar
28897 Doddi doddi
41127 Eveready eveready
72358 Kukatpally kukatpally

make another dataframe with unique values of zap

In [6]:
# One row per distinct zap value, sorted, with a fresh 0..n-1 index.
uniqueNameDF = (
    stopsDF[['zap']]
    .drop_duplicates()
    .sort_values(by='zap')
    .copy()
    .reset_index(drop=True)
)
print(uniqueNameDF.shape[0])
uniqueNameDF.tail(n=10)
4452
Out[6]:
zap
4442 zia
4443 ziatalkiesmallepally
4444 zindatalismath
4445 zoopark
4446 zphighschool
4447 zrtc
4448 ztc
4449 ztcxroad
4450 zts
4451 ztsxroad

assign a 3-letter Alpha code to each zapped name

In [7]:
# from https://www.geeksforgeeks.org/ways-increment-character-python/
# ord() maps a character to its integer code; chr() maps the integer back.
print("Characters range: A:{} to Z:{}".format(*map(ord, 'AZ')))
print(chr(65))
Characters range: A:65 to Z:90
A
In [8]:
# Give every row of uniqueNameDF a fixed-width 3-letter code (AAA, AAB, ...,
# ZZZ), handed out in the table's existing row order.
code_width = 3
code_letters = [chr(point) for point in range(ord('A'), ord('Z') + 1)]
code_capacity = len(code_letters) ** code_width  # 26**3 = 17576 distinct codes

# Without this guard, rows beyond the available code space would be left
# without a code and only fail later, when the codes are joined into patterns.
if len(uniqueNameDF) > code_capacity:
    raise ValueError(
        "{} unique names do not fit into {} codes of width {}".format(
            len(uniqueNameDF), code_capacity, code_width))

# product() yields letter tuples in the same order as three nested loops over
# A..Z; islice() stops after exactly one code per row. Assigning the whole
# column at once avoids growing the frame cell by cell.
all_codes = (''.join(letters) for letters in product(code_letters, repeat=code_width))
uniqueNameDF['code'] = list(islice(all_codes, len(uniqueNameDF)))

uniqueNameDF now has a code for each zapped name

In [9]:
# Spot-check ten random rows of the zap -> code table.
uniqueNameDF.sample(10)
Out[9]:
zap code
2860 nmguda EGA
1569 iiitdlf CIJ
1308 gowlipura BYI
4014 tadbund FYK
546 bloodbank AVA
218 aocchowk AIK
3230 putlibowlikoti EUG
591 bothapallykaman AWT
466 begumpetrailwaystationgreenlands ARY
3175 prajaynivasprnapar ESD

Import these codes back to all stops DF

In [10]:
# left join in python from http://www.datasciencemadesimple.com/join-merge-data-frames-pandas-python/
# Attach each stop's code through its 'zap' key. validate='many_to_one' makes
# pandas raise a MergeError if 'zap' were ever duplicated in uniqueNameDF,
# which would otherwise silently duplicate stop rows in the result.
stopsDF2 = pd.merge(stopsDF, uniqueNameDF, on='zap', how='left', validate='many_to_one')
# Every stop needs a code: a missing one cannot be joined into a pattern string.
assert stopsDF2['code'].notna().all(), "some stops did not receive a code"
stopsDF2[['folder','jsonFile','direction_id','stop_name','zap','code']].sample(10)
Out[10]:
folder jsonFile direction_id stop_name zap code
38277 MHRM 458T_MHRM.json 0 City College citycollege BHD
31741 DSNR 524KR.json 0 High Court Colony highcourtcolony CDW
4347 HN2 290K_P.json 1 RTC Colony rtccolony FEG
58155 MPM 120M_118.json 0 Mahavir Hospital mahavirhospital DKU
48522 BGD 156B_118.json 1 Lakdikapul lakdikapul DER
58952 FQNR 8A.json 1 M.Tank Bund mtankbund DVW
4733 HN2 299N.json 0 Malakpet Chermas malakpetchermas DMD
71219 MSRD2 218.json 1 Bharath Nagar bharathnagar ASJ
24891 HCU 10H_U.json 0 Chinna Anjaiah Nagar chinnaanjaiahnagar BEF
23050 HPT 90L.json 1 LB Nagar X Road lbnagarxroad DHE

Now we create a collapsed table

Collapse along : folder + jsonFile + direction_id

In [11]:
def stringEmUp(x):
    """Collapse one group of stop rows into a single summary row.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group; must contain a 'code' column of strings.

    Returns
    -------
    pandas.Series
        Two entries, in this order: 'pattern' (the group's codes concatenated
        in row order) and 'len' (the number of rows in the group).
    """
    # Build the OrderedDict from (key, value) pairs. Wrapping a dict literal
    # only keeps 'pattern' ahead of 'len' on Pythons whose plain dicts already
    # preserve insertion order, which defeats the purpose of OrderedDict.
    summary = OrderedDict([
        ('pattern', ''.join(x['code'])),
        ('len', len(x)),
    ])
    return pd.Series(summary)
In [12]:
# Collapse to one row per (folder, jsonFile, direction_id) with stringEmUp.
routeDF1 = (
    stopsDF2.loc[:, ['folder', 'jsonFile', 'direction_id', 'code']]
    .groupby(by=['folder', 'jsonFile', 'direction_id'])
    .apply(stringEmUp)
)
routeDF1.head(n=10)
Out[12]:
pattern len
folder jsonFile direction_id
BGD 107V_R.json 0 FBWFBWBDOATMBTGDXEEWMANTDXEAYAGKOGKMGJIGJIFXOE... 51
1 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMFSGADKEWVCTBDGYB... 51
117L.json 0 APCBZQCKCCKDDQHCJYAGHAJZFVWDZMDNPDZKGFQGFEGFLB... 39
1 DEABOGBOGFSQDEBFSRGEZAIQCCYEGVAAADNGDNDCNOBLMC... 39
156B_118.json 0 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMEDYDUJBHOFLEDANA... 43
1 BYTBSIGAGCWVCFQFMYAXNGCUFHQEBDDRKFKFEFWDPUEPED... 43
156_299.json 0 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMEDYDUJBHOFLEBEED... 34
1 DRKFKEEFWDPUBWFDKSDPUAAWDERDERDERALGEAPEALEAXE... 51
158J_L.json 0 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMFSGADKEWVCTBDHED... 76
1 COUCOWFOEBSYCFCCEZBFRBEXBUDBLQGLMCBLGGEAOHEFJC... 70

bring the folder, jsonFile, direction_id values back into a flat table

In [13]:
# reset_index() moves the groupby index levels back into ordinary columns.
routeDF2 = routeDF1.reset_index()
routeDF2.head(10)
Out[13]:
folder jsonFile direction_id pattern len
0 BGD 107V_R.json 0 FBWFBWBDOATMBTGDXEEWMANTDXEAYAGKOGKMGJIGJIFXOE... 51
1 BGD 107V_R.json 1 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMFSGADKEWVCTBDGYB... 51
2 BGD 117L.json 0 APCBZQCKCCKDDQHCJYAGHAJZFVWDZMDNPDZKGFQGFEGFLB... 39
3 BGD 117L.json 1 DEABOGBOGFSQDEBFSRGEZAIQCCYEGVAAADNGDNDCNOBLMC... 39
4 BGD 156B_118.json 0 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMEDYDUJBHOFLEDANA... 43
5 BGD 156B_118.json 1 BYTBSIGAGCWVCFQFMYAXNGCUFHQEBDDRKFKFEFWDPUEPED... 43
6 BGD 156_299.json 0 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMEDYDUJBHOFLEBEED... 34
7 BGD 156_299.json 1 DRKFKEEFWDPUBWFDKSDPUAAWDERDERDERALGEAPEALEAXE... 51
8 BGD 158J_L.json 0 APCBZQDQHCKDCJWAGHAPEAJZFVWDZMFSGADKEWVCTBDHED... 76
9 BGD 158J_L.json 1 COUCOWFOEBSYCFCCEZBFRBEXBUDBLQGLMCBLGGEAOHEFJC... 70

Confirm that each pattern's length is 3 × len (one 3-letter code per stop).

In [14]:
# Spot-check two routes: characters in the pattern divided by 3 should match 'len'.
print(len(routeDF2.at[0,'pattern'])/3 )

print(len(routeDF2.at[6,'pattern'])/3 )

# Two rows are not proof, so check every route: each stop contributes exactly
# one 3-letter code to its route's pattern.
assert (routeDF2['pattern'].str.len() == routeDF2['len'].astype(int) * 3).all(), \
    "pattern length is not 3 x len for every route"

# confirmed
51.0
34.0

output this to route_patterns.csv

In [15]:
# Write the route table to route_patterns.csv; the row index is saved as a column labelled 'sr'.
routeDF2.to_csv('route_patterns.csv',index_label='sr')

Also write out stopsDF2 (the stops table with the zap and code columns added).

In [16]:
# Write the merged stops table to stops_all_coded.csv; the row index is saved as a column labelled 'sr'.
stopsDF2.to_csv('stops_all_coded.csv',index_label='sr')
In [17]:
# Visual separator only: displays a row of 70 '#' characters.
'#'*70
Out[17]:
'######################################################################'

Extra : Data integrity test : Can we reconstruct a route from a pattern?

In [18]:
# Take the route at row position 45 of routeDF2 as the example for the reconstruction check.
trip = routeDF2.iloc[45]
trip
Out[18]:
folder                                                        BGD
jsonFile                                             217_300.json
direction_id                                                    0
pattern         APCBZQCKCCKDDQHCJYAGHAJZFVWDZMDNPDZKGFQFJPDNQD...
len                                                            79
Name: 45, dtype: object
In [19]:
# Number of 3-character codes in the trip's pattern; compare with the trip's 'len' value.
int(len(trip.pattern)/3)
Out[19]:
79

make a uniqueNameDF2 with code as index so lookup becomes easy

In [20]:
# Re-index the table by 'code' so a code can be used directly as the row label in lookups.
uniqueNameDF2 = uniqueNameDF.set_index('code',drop=True)
uniqueNameDF2.sample(10)
Out[20]:
zap
code
AIQ aphbcolony
FEB rspally
BWC gnagarxroad
GIZ venkateswaracolony
DJQ madhapurimagegradens
GEP udamgadda
FPW shivareddyguda
BBL channapuram
FFZ saibabacolony
AAZ acharyangrangaagriculturaluniversity
In [21]:
# The example trip's concatenated code string.
pattern = trip.pattern
In [22]:
# Decode the pattern back into zapped names, one 3-letter code at a time.
# The number of steps comes from the pattern itself rather than a hard-coded
# stop count, so the cell still works when a different trip is selected.
code_width = 3
for startpos in range(0, len(pattern), code_width):
    code = pattern[startpos:startpos + code_width]
    print(code, uniqueNameDF2.at[code,'zap'])
APC bandlagudadepot
BZQ gsigate
CKC induaranya
CKD induaranyapallavi
DQH mathuguda
CJY indraprastacolony
AGH anandnagar
AJZ arunodayacolony
FVW sujathahotel
DZM nagolexroad
DNP mamathanagar
DZK nagolemetrostation
GFQ uppalxroad
FJP saraswathinagar
DNQ mamathanagarcolony
DZI nagolebusstand
FSD snehapuricolony
ADI alkapuri
EWV rajeevgandhinagar
DPG mansoorabadkaminenihospital
AZE centralbankcolony
DHC lbnagarringroad
FFV sagarxroad
BVD gayatrinagarxroad
DOG mandamallamma
EJS owaisihospital
EPN pisalbanda
BMS drdl
EOU pgadasl
CBF hafizbabanagar
AMN babanagar
BLX dlrl
CQX junction
CWD keshavgiri
DGT laxmiguda
APA bandlaguda
GEP udamgadda
DLJ mailardevpally
BNN durganagar
FPN shivampally
AMS babulreddynagar
AAI aaramghar
AJJ aramgharmehidipatnam
FQD shivrampallyxroad
FPV shivarampallyquarters
BIO dairyfarm
CBV happyhome
GFV upperpallyxroad
FCF ringroad
CGN hyderguda
GAE tejaswinagar
ALO attapur
FCF ringroad
CRB jyothinagar
DPT maruthinagar
DFH lakshminagar
FBU rethibowli
DRK mehdipatnam
EBD nanalnagar
FHQ salarjungcolony
GCU tolichowki
AXN brundavancolony
FMY shaikpetdarga
CFQ hsdargha
CWV khajaguda
GAG telecomnagar
BSI gachibowlibusstation
BYT gpraqtrs
CDL hcu
DIE lingampallyrailwaystation
AKQ ashoknagarlingampally
ATA bhel
EMJ patancheru
EQD pocharam
EIV orr
DXO muttangi
CLI isnapurxroad
DEM lakdaram
FES rudraram
In [23]:
# now fetch the same trip from orig stops data
stopsDF2[(stopsDF2.folder == trip.folder) & (stopsDF2.jsonFile==trip.jsonFile) &\
    (stopsDF2.direction_id==trip.direction_id)][['folder','jsonFile','direction_id','stop_name','code','zap']]
Out[23]:
folder jsonFile direction_id stop_name code zap
46367 BGD 217_300.json 0 Bandlaguda Depot APC bandlagudadepot
46368 BGD 217_300.json 0 GSI Gate BZQ gsigate
46369 BGD 217_300.json 0 Indu Aranya CKC induaranya
46370 BGD 217_300.json 0 Indu Aranya Pallavi CKD induaranyapallavi
46371 BGD 217_300.json 0 Mathuguda DQH mathuguda
46372 BGD 217_300.json 0 Indra Prasta Colony CJY indraprastacolony
46373 BGD 217_300.json 0 Anand Nagar AGH anandnagar
46374 BGD 217_300.json 0 Arunodaya Colony AJZ arunodayacolony
46375 BGD 217_300.json 0 Sujatha Hotel FVW sujathahotel
46376 BGD 217_300.json 0 Nagole X Road DZM nagolexroad
46377 BGD 217_300.json 0 Mamatha Nagar DNP mamathanagar
46378 BGD 217_300.json 0 Nagole Metro Station DZK nagolemetrostation
46379 BGD 217_300.json 0 Uppal X Road GFQ uppalxroad
46380 BGD 217_300.json 0 Saraswathi Nagar FJP saraswathinagar
46381 BGD 217_300.json 0 Mamatha Nagar Colony DNQ mamathanagarcolony
46382 BGD 217_300.json 0 Nagole Bus Stand DZI nagolebusstand
46383 BGD 217_300.json 0 Snehapuri Colony FSD snehapuricolony
46384 BGD 217_300.json 0 Alkapuri ADI alkapuri
46385 BGD 217_300.json 0 Rajeev Gandhi Nagar EWV rajeevgandhinagar
46386 BGD 217_300.json 0 Mansoorabad Kamineni Hospital DPG mansoorabadkaminenihospital
46387 BGD 217_300.json 0 Central Bank Colony AZE centralbankcolony
46388 BGD 217_300.json 0 LB Nagar Ring Road DHC lbnagarringroad
46389 BGD 217_300.json 0 Sagar X Road FFV sagarxroad
46390 BGD 217_300.json 0 Gayatri Nagar X Road BVD gayatrinagarxroad
46391 BGD 217_300.json 0 Manda Mallamma DOG mandamallamma
46392 BGD 217_300.json 0 Owaisi Hospital EJS owaisihospital
46393 BGD 217_300.json 0 Pisalbanda EPN pisalbanda
46394 BGD 217_300.json 0 DRDL BMS drdl
46395 BGD 217_300.json 0 Pgad / ASL EOU pgadasl
46396 BGD 217_300.json 0 Hafiz Baba Nagar CBF hafizbabanagar
... ... ... ... ... ... ...
46416 BGD 217_300.json 0 Hyderguda CGN hyderguda
46417 BGD 217_300.json 0 Tejaswi Nagar GAE tejaswinagar
46418 BGD 217_300.json 0 Attapur ALO attapur
46419 BGD 217_300.json 0 Ring Road FCF ringroad
46420 BGD 217_300.json 0 Jyothi Nagar CRB jyothinagar
46421 BGD 217_300.json 0 Maruthi Nagar DPT maruthinagar
46422 BGD 217_300.json 0 Lakshmi Nagar DFH lakshminagar
46423 BGD 217_300.json 0 Rethibowli FBU rethibowli
46424 BGD 217_300.json 0 Mehdipatnam DRK mehdipatnam
46425 BGD 217_300.json 0 Nanal Nagar EBD nanalnagar
46426 BGD 217_300.json 0 Salarjung Colony FHQ salarjungcolony
46427 BGD 217_300.json 0 Toli Chowki GCU tolichowki
46428 BGD 217_300.json 0 Brundavan Colony AXN brundavancolony
46429 BGD 217_300.json 0 Shaikpet Darga FMY shaikpetdarga
46430 BGD 217_300.json 0 HS Dargha CFQ hsdargha
46431 BGD 217_300.json 0 Khajaguda CWV khajaguda
46432 BGD 217_300.json 0 Telecom Nagar GAG telecomnagar
46433 BGD 217_300.json 0 Gachibowli Bus Station BSI gachibowlibusstation
46434 BGD 217_300.json 0 Gpra Qtrs BYT gpraqtrs
46435 BGD 217_300.json 0 HCU CDL hcu
46436 BGD 217_300.json 0 Lingampally Railway Station DIE lingampallyrailwaystation
46437 BGD 217_300.json 0 Ashok Nagar / Lingampally AKQ ashoknagarlingampally
46438 BGD 217_300.json 0 BHEL ATA bhel
46439 BGD 217_300.json 0 Patancheru EMJ patancheru
46440 BGD 217_300.json 0 Pocharam EQD pocharam
46441 BGD 217_300.json 0 ORR EIV orr
46442 BGD 217_300.json 0 Muttangi DXO muttangi
46443 BGD 217_300.json 0 Isnapur X Road CLI isnapurxroad
46444 BGD 217_300.json 0 Lakdaram DEM lakdaram
46445 BGD 217_300.json 0 Rudraram FES rudraram

79 rows × 6 columns

YESS we have data integrity

The zapped names from pattern and the original route's zap column are the same