## GTFS feed from in-progress Hyderabad bus route mapping

Steps:  
- scan the done/ routes first, then the data/ routes
- skip stops that haven't been mapped, but process the whole route regardless
- build stops DB as each route is processed 
- don't bother with stop names : if there's an existing stop of same lat-long in stops DB, take its stop_id.
- if time offsets are absent in route sequence, assume a 2-min offset per stop on the sequence.
- if timings for that route are missing, then assume one trip in each direction starting at 10am.
- stop_times: append to stop_times.txt at the end of each route's loop instead of collecting everything and writing at the end, because the collection gets very large and the program hangs halfway through

In [1]:
import pandas as pd
from collections import OrderedDict
import time, datetime
import os
import re
import requests
from bs4 import BeautifulSoup

In [2]:
# Folders holding the route sequence csv's; seqFolder is scanned before seqFolder2.
seqFolder, seqFolder2 = "done/", "data/"
# Folder holding the per-route start time csv's (looked up by the sequence file's name).
timeFolder = "start_times/"
# Folder the GTFS .txt files are written into.
outputFolder = "gtfs/"
# service_id assigned to every generated trip.
CalendarServiceID = "ALL"

## Make agency, calendar, feed_info files
Get the pre-set things done

In [3]:
# agency.txt: the single agency this feed is published for.
agency = """agency_id,agency_name,agency_url,agency_timezone
TSRTC_HYD,Telangana State Road Transport Corporation (Hyderabad local bus),http://tsrtconline.in,Asia/Kolkata
"""
# This is the first file written, so make sure the output folder exists;
# otherwise open() fails with FileNotFoundError on a fresh checkout.
if outputFolder:
    os.makedirs(outputFolder, exist_ok=True)
# Plain 'w' is enough (the file is never read back); the encoding is pinned so
# the result does not depend on the platform's default.
with open(outputFolder+'agency.txt','w', encoding='utf-8') as f:
    f.write(agency)

In [4]:
# feed_info.txt: one header line and one row describing the publisher of this feed.
feed_info = (
    'feed_publisher_name,feed_publisher_url,feed_lang,feed_version\n'
    'Telangana Open Data Portal,http://www.data.telangana.gov.in/,en,2018.09.30\n'
)
with open(outputFolder + 'feed_info.txt', 'w+') as feedInfoFile:
    feedInfoFile.write(feed_info)

In [5]:
# calendar.txt: a single all-week service.
# The service_id is taken from CalendarServiceID (the same value stamped on
# every trip) instead of being hard-coded, so the two files cannot drift apart.
calendar = """service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
{},1,1,1,1,1,1,1,20180901,20990101
""".format(CalendarServiceID)
with open(outputFolder+'calendar.txt','w', encoding='utf-8') as f:
    f.write(calendar)

## Functions

In [6]:
def zapper(x):
    """Return x with every non-alphanumeric character removed and the rest upper-cased."""
    return ''.join(map(str.upper, filter(str.isalnum, x)))

def findFiles(folder, ext='.csv', prefix=None):
    """List the regular files directly inside `folder` that match `ext` and `prefix`.

    Matching is case-insensitive for both the extension and the prefix.
    Sub-folders are ignored even when their name matches.

    Args:
        folder: directory to scan (not recursive).
        ext: required file name ending, e.g. '.csv'.
        prefix: optional required file name beginning; None or '' accepts all.

    Returns:
        The matching file names (without the folder), sorted, so the processing
        order does not depend on the order os.listdir happens to return.
    """
    # the name is lower-cased before comparing, so the extension must be too
    wantedExt = ext.lower()
    return sorted(
        f for f in os.listdir(folder)
        if f.lower().endswith(wantedExt)
        and checkPrefix(f, prefix)
        and os.path.isfile(os.path.join(folder, f))
    )


def checkPrefix(f,prefix):
    """Return True if file name `f` starts with `prefix`, ignoring case.

    A missing prefix (None or '') matches every name.
    """
    if not prefix:
        return True
    # lower-case both sides; lowering only the name made mixed-case prefixes unmatchable
    return f.lower().startswith(prefix.lower())


In [7]:
def fetchTitle(route, timeout=10):
    """Look up a human-readable title for `route` on hyderabadcitybus.in.

    The title is the text between the first pair of parentheses in the page's
    '#onwards-route' element, with the arrow separator removed.

    This is best-effort: if the site cannot be reached, the element is missing,
    or it holds no parenthesised text, 'route <route>' is returned instead, so a
    flaky connection never aborts the whole feed build.

    Args:
        route: route number as a string, as used in the site's URL.
        timeout: seconds to wait for the web server before giving up.

    Returns:
        The route title as a string.
    """
    # if can't find it online, just prefix with 'route '
    fallback = 'route ' + route

    url = 'http://www.hyderabadcitybus.in/route-no/{}/'.format(route)
    try:
        # without a timeout a stalled server would hang the main loop forever
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return fallback
    soup = BeautifulSoup(r.content,'lxml')

    bigtitleHolder = soup.select_one('#onwards-route')
    if not bigtitleHolder:
        return fallback

    # get (text in parenthesis), from https://stackoverflow.com/a/4894156/4355695
    bigtitle = bigtitleHolder.text
    opening = bigtitle.find('(')
    closing = bigtitle.find(')', opening + 1)
    if opening < 0 or closing < 0:
        # find() returned -1: slicing with it would silently drop characters
        return fallback
    title = bigtitle[opening+1 : closing]

    # NOTE(review): ' â†’' looks like a mis-decoded ' →'. Both spellings are
    # stripped, since it is unclear here which one the parsed page contains.
    for arrow in (' →', ' â†’'):
        title = title.replace(arrow, '')

    return title or fallback

In [8]:
def calcTripID(row,timerow):
    """Build the trip_id for one trip: '<route_id>_<HHMM>' plus '_<direction>' if set.

    Examples: 219_0501, and with a direction present 219_0501_0.

    Args:
        row: mapping with 'route_id' and 'direction_id'.
        timerow: object with a `start_time` attribute like 'H:MM[:SS]' or 'HH:MM[:SS]'.

    Returns:
        The trip_id string.
    """
    # Split on ':' rather than slicing the first 5 characters: the slice turned
    # an unpadded '5:01:00' into '501' instead of '0501'.
    clock = str(timerow.start_time).strip().split(':')
    hh = clock[0].zfill(2)
    mm = clock[1].zfill(2) if len(clock) > 1 else '00'
    trip_id = '{}_{}{}'.format(row['route_id'], hh, mm)

    # A direction counts as present unless it is None, NaN or blank. A plain
    # truthiness test would drop an integer direction 0 and append '_nan' for
    # a missing value.
    direction = row['direction_id']
    if not pd.isna(direction) and str(direction).strip() != '':
        trip_id += '_{}'.format(direction)
    return trip_id

In [9]:
def calcTimeOffset(start_time, offset):
    """Shift `start_time` by `offset` minutes and return it as 'HH:MM:00'.

    `start_time` is a ':'-separated string whose first two fields are hours and
    minutes; every field must be an integer, but anything after the minutes is
    discarded. `offset` may be an int or an integer string. Hours are not
    wrapped at 24, so a result such as '24:10:00' is possible.
    """
    fields = list(map(int, start_time.split(':')))
    carryHours, minutes = divmod(fields[1] + int(offset), 60)
    hours = fields[0] + carryHours
    return '{}:{}:00'.format(str(hours).rjust(2, '0'), str(minutes).rjust(2, '0'))
    
# test: 3:40 plus 26 minutes should carry over into the next hour and come
# back zero-padded, i.e. '04:06:00'
calcTimeOffset('3:40:00',26)

'04:06:00'

In [10]:
def calcStopID(row):
    """Return the stop_id for one row of a route sequence, registering new stops.

    If stopsDB already holds a stop at exactly the same lat-long, that stop's id
    is reused: the entry whose zapped name also matches if there is one,
    otherwise the first entry at that location. Nothing is added in that case.

    Otherwise a new id is built from the first 5 characters of the zapped stop
    name (right-padded with '0') plus the lowest numeric suffix not yet taken,
    and the stop is appended to the global stopsDB.

    Args:
        row: a row with stop_name, stop_lat, stop_lon and optionally stop_desc.

    Returns:
        The stop_id to use for this row.
    """
    global stopsDB
    # informing the function to use global var, don't make a local one. from https://stackoverflow.com/a/423596

    lat = row.stop_lat
    lon = row.stop_lon
    zappedName = zapper(row.stop_name)

    # finding a match by lat-long, no name/id funny business!
    # Done first so that no id is generated for a stop that already exists.
    if len(stopsDB) and lat and lon:
        matchingLoc = stopsDB[ (stopsDB.stop_lat == lat) & (stopsDB.stop_lon == lon) ]
        if len(matchingLoc):
            for _, known in matchingLoc.iterrows():
                if zapper(known.stop_name) == zappedName:
                    # matching lat-long and name. So take this stop_id
                    return known.stop_id
            # matching lat-long but different name: don't create a new stop,
            # use the first stop_id from the matches
            return matchingLoc.iloc[0].stop_id

    # determining a new, unique stop_id
    takenIDs = set(stopsDB.stop_id)
    idPrefix = zappedName[:5].ljust(5,'0')
    suffix = 1
    while idPrefix + str(suffix) in takenIDs:
        suffix += 1
    stop_id = idPrefix + str(suffix)

    # adding to stopsDB
    stopRow = {'stop_id':stop_id, 'stop_name':row.stop_name, 'stop_lat':lat, 'stop_lon':lon}
    stop_desc = row.get('stop_desc','')
    # keep the description only when it adds something beyond the name
    if isinstance(stop_desc, str) and stop_desc and (stop_desc.lower() != row.stop_name.lower()):
        stopRow['stop_desc'] = stop_desc
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    stopsDB = pd.concat([stopsDB, pd.DataFrame([stopRow])], ignore_index=True)
    return stop_id

In [11]:
# scratch check of the stop_id prefix rule used in calcStopID: a zapped name
# shorter than 5 characters is right-padded with '0'
zapper('yo')[:5].ljust(5,'0')

'YO000'

In [12]:
# scratch check: an empty string is falsy, so blank csv values take the else branch
if '': print('yay')
else: print('nay')

nay


## load both sequence and timing csv at same time, and process.

In [13]:
# Shared state filled in by the main loop; stopsDB is grown by calcStopID.
stopsDB = pd.DataFrame(columns='stop_id stop_name stop_lat stop_lon stop_desc'.split())
# One list per output table; each entry is one row.
routesCollector, tripsCollector, stoptimesCollector = [], [], []
# Number of route files handled so far.
mainLoopCount = 0

In [14]:
# generic timeDF: preview of the fallback timetable used when a route has no
# start times file - one trip at 10:00:00 in each direction.
# NOTE(review): the directions are ints here, while the main loop builds its
# fallback with the strings '0' and '1'.
pd.DataFrame({'direction':[0,1], 'start_time':'10:00:00'})

Unnamed: 0,direction,start_time
0,0,10:00:00
1,1,10:00:00


### merge the done/ and data/ csv's in a list

In [15]:
# Every sequence csv with its folder prefixed: seqFolder's files first, then seqFolder2's.
allSequencesList = []
for _folder in (seqFolder, seqFolder2):
    allSequencesList += [_folder + _name for _name in findFiles(_folder)]
# peek at the first 20 entries
allSequencesList[:20]

['done/120M.csv',
 'done/18B.csv',
 'done/195J.csv',
 'done/19M.csv',
 'done/218D.csv',
 'done/219.csv',
 'done/225.csv',
 'done/227B.csv',
 'done/229.csv',
 'done/22T.csv',
 'done/242G.csv',
 'done/3JJ.csv',
 'data/102B_218L.csv',
 'data/102C.csv',
 'data/102CJ.csv',
 'data/102D.csv',
 'data/102F.csv',
 'data/102H.csv',
 'data/102K.csv',
 'data/102N.csv']

In [16]:
# how to get just the filename: the entries are '<folder>/<file>', so the last
# '/'-separated piece is the file name
allSequencesList[0].split('/')[-1]

'120M.csv'

### now main loop

In [17]:
# Column layout of stop_times.txt. Fixed up-front so the header is written
# correctly even when the first route yields no rows.
stopTimesColumns = ['trip_id','arrival_time','departure_time','stop_id','stop_sequence','timepoint']
# Becomes True once stop_times.txt has been (re)created with its header in this run.
stopTimesStarted = False


def _parseCoord(value):
    """Return `value` as a float rounded to 5 decimals, or None if blank or not a number."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if number != number:  # NaN, e.g. from the text 'nan'
        return None
    return round(number, 5)


def _loadStartTimes(csvFile, route_id):
    """Load the start times of one route; fall back to one 10am trip per direction.

    The fallback is used when the file is missing, empty, or holds no start time.
    Blank cells are turned into '' so a blank direction here compares equal to a
    blank direction in the sequence csv, which is loaded with fillna('') too.
    """
    try:
        timeDF = pd.read_csv(timeFolder+csvFile, dtype=str)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        timeDF = None
    if timeDF is not None:
        # some csv's have duplicate entries so dropping duplicates
        timeDF = timeDF.dropna(subset=['start_time']).fillna('')\
            .drop_duplicates(['direction','start_time'])
    if timeDF is None or not len(timeDF):
        print('For route {}, no start times data found so using generic template instead, one trip at 10am in each direction.'.format(route_id))
        timeDF = pd.DataFrame({'direction':['0','1'], 'start_time':'10:00:00'})
    return timeDF


for csvFilePath in allSequencesList:
    if mainLoopCount % 20 == 0 : print('\n\n{} routes processed, {} stops collected.\n\n'.format(mainLoopCount,len(stopsDB) ) )

    csvFile = os.path.basename(csvFilePath) # getting the filename again out of the full path
    print('Processing {}'.format(csvFile))

    # loading sequence:
    seqDF = pd.read_csv(csvFilePath, dtype=str).fillna('')

    # make lat-longs as float, rounded to 5 decimals, and drop the entries where
    # lat or long hasn't been set yet (or isn't a number).
    # The earlier filter compared the `.str` accessor itself with '', which is
    # always True, so rows with a blank latitude were never dropped.
    seqDF['stop_lat'] = seqDF.stop_lat.apply(_parseCoord)
    seqDF['stop_lon'] = seqDF.stop_lon.apply(_parseCoord)
    seqDF = seqDF.dropna(subset=['stop_lat','stop_lon']).reset_index(drop=True)

    if not len(seqDF):
        # nothing mapped yet: there is no stop to build a trip from
        print('Skipping {}: no stops with a usable lat-long yet.'.format(csvFile))
        mainLoopCount += 1
        continue

    # route
    route = seqDF.route.iloc[0]
    route_id = os.path.splitext(csvFile)[0] # let's not zap it, if its a filename then its valid
    routesCollector.append(OrderedDict([
        ('route_id', route_id),
        ('route_short_name', route),
        ('route_long_name', fetchTitle(route)), # outsourced work to a function
        ('route_type', '3'), # 3:bus
    ]))

    # stops
    # create a stop_id column; calcStopID also grows the global stopsDB
    seqDF['stop_id'] = seqDF.apply(calcStopID, axis=1)

    # load start times
    timeDF = _loadStartTimes(csvFile, route_id)

    # trips
    for _, timerow in timeDF.iterrows():
        # hey, filter by direction id!
        seqTripDF = seqDF[seqDF.direction == timerow.direction].reset_index(drop=True)
        if not len(seqTripDF):
            # a trip without any stop_times is unusable, so don't emit it
            continue

        row = OrderedDict([
            ('route_id', route_id),
            ('service_id', CalendarServiceID),
            ('direction_id', timerow.direction),
        ])
        row['trip_id'] = calcTripID(row,timerow) # outsourced work to a function
        tripsCollector.append(row)

        # stop_times , within trips loop
        for sequence,seqRow in seqTripDF.iterrows():
            # offset: in case its absent OR blank, take default as 2 mins per stop
            offset = seqRow.get('offset','')
            if str(offset).strip() == '':
                offset = 2*sequence
            timing = calcTimeOffset(timerow.start_time, offset)
            stoptimesCollector.append(OrderedDict([
                ('trip_id', row['trip_id']),
                ('arrival_time', timing),
                ('departure_time', timing),
                ('stop_id', seqRow.stop_id),
                ('stop_sequence', sequence+1),
                ('timepoint', '0'),
            ]))

    # incrementally writing to and saving stop_times.txt:
    # first write of this run creates the file with its header, later ones append
    pd.DataFrame(stoptimesCollector, columns=stopTimesColumns).to_csv(
        outputFolder+'stop_times.txt', index=False,
        mode='a' if stopTimesStarted else 'w',
        header=not stopTimesStarted)
    stopTimesStarted = True
    stoptimesCollector = []

    mainLoopCount += 1



0 routes processed, 0 stops collected.


Processing 120M.csv
Processing 18B.csv
Processing 195J.csv
For route 195J, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 19M.csv
Processing 218D.csv
Processing 219.csv
Processing 225.csv
Processing 227B.csv
Processing 229.csv
Processing 22T.csv
Processing 242G.csv
Processing 3JJ.csv
Processing 102B_218L.csv
Processing 102C.csv
Processing 102CJ.csv
Processing 102D.csv
Processing 102F.csv
Processing 102H.csv
Processing 102K.csv
Processing 102N.csv


20 routes processed, 753 stops collected.


Processing 102P.csv
Processing 102S.csv
Processing 102W.csv
Processing 103.csv
Processing 104A.csv
Processing 104G.csv
Processing 104R.csv
Processing 104S.csv
Processing 104T.csv
Processing 105.csv
Processing 105M.csv
Processing 107.csv
Processing 107B.csv
Processing 107G.csv
Processing 107H.csv
Processing 107J.csv
Processing 107K.csv
Processing 107KV.csv
For route 107KV, no start times data foun

Processing 156.csv
Processing 156B.csv
Processing 156H.csv


180 routes processed, 1260 stops collected.


Processing 156L.csv
Processing 156L_216G.csv
For route 156L_216G, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 156V.csv
Processing 156V_118W.csv
Processing 156_204U.csv
Processing 156_216.csv
Processing 156_216G.csv
Processing 156_216L.csv
For route 156_216L, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 156_288H.csv
For route 156_288H, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 156_299.csv
Processing 158.csv
For route 158, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 158D_L.csv
For route 158D_L, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 158F.csv
Processing 158FB.csv


Processing 217D_L.csv
For route 217D_L, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 217M.csv
Processing 217X.csv
Processing 218.csv
Processing 218C.csv
Processing 218D_L.csv
For route 218D_L, no start times data found so using generic template instead, one trip at 10am in each direction.


340 routes processed, 1420 stops collected.


Processing 218HG.csv
For route 218HG, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 218L.csv
For route 218L, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 218L_V.csv
Processing 218_19M.csv
For route 218_19M, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 219_229.csv
Processing 21B.csv
Processing 21W.csv
Processing 21W_107J.csv
Processing 21W_2017J.csv
Processing 21_107J.csv
Processing 220V.csv
Processing 222L.c

Processing 3Z.csv
Processing 4.csv
Processing 40.csv
Processing 401.csv


580 routes processed, 1616 stops collected.


Processing 40D.csv
Processing 40N.csv
Processing 40T.csv
Processing 40V.csv
Processing 41.csv
Processing 41C.csv
Processing 41K.csv
Processing 41N.csv
Processing 42.csv
Processing 43.csv
Processing 44L.csv
Processing 44X.csv
Processing 45.csv
Processing 45A.csv
Processing 45G.csv
Processing 45HK.csv
Processing 45J.csv
Processing 46P.csv
For route 46P, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 47C.csv
Processing 47F.csv


600 routes processed, 1627 stops collected.


Processing 47K.csv
Processing 47Y_90D.csv
Processing 47Y_90U.csv
Processing 48.csv
Processing 49.csv
For route 49, no start times data found so using generic template instead, one trip at 10am in each direction.
Processing 49A.csv
Processing 49E.csv
Processing 49J.csv
Processing 49K.csv
For route 49K, no start times data found so using gener

Processing Sec-Airport.csv


In [18]:
# generating output
# no stop_times here: that file is already appended to in the main loop.
for _table, _fileName in (
        (pd.DataFrame(routesCollector), 'routes.txt'),
        (stopsDB.sort_values('stop_id'), 'stops.txt'),
        (pd.DataFrame(tripsCollector), 'trips.txt')):
    _table.to_csv(outputFolder + _fileName, index=False)


In [19]:
# a row of 100 '#' characters, used as a visual separator in the notebook output
'#'*100

'####################################################################################################'