{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# to do\n",
"- with the stops-geolocated-combined-orig.csv , collapse the duplicates\n",
"- do some data analysis\n",
"- make geohash?\n",
"- or, take significant digits of lat-longs"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/mnt/7ACA0CC0CA0C7B21/temp/py36/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 56, got 52\n",
" return f(*args, **kwds)\n"
]
}
],
"source": [
"import pandas as pd\n",
"from collections import OrderedDict\n",
"import openlocationcode as olc"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (5082, 6)\n",
"columns: Index(['source', 'stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'stop_desc'], dtype='object')\n",
"\n",
"Sample data:\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" source | \n",
" stop_id | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" stop_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" gps-logs-ghz | \n",
" 889 | \n",
" 6 NUMBER | \n",
" 17.3918 | \n",
" 78.51082 | \n",
" 6 Number Towards Tilak Nagar | \n",
"
\n",
" \n",
" 1800 | \n",
" kodali-stops | \n",
" 348 | \n",
" koti | \n",
" 17.38471 | \n",
" 78.48426 | \n",
" | \n",
"
\n",
" \n",
" 3365 | \n",
" shakti-gtfs | \n",
" 1 | \n",
" YANAMPET | \n",
" 17.45620948 | \n",
" 78.65957737 | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source stop_id stop_name stop_lat stop_lon \\\n",
"0 gps-logs-ghz 889 6 NUMBER 17.3918 78.51082 \n",
"1800 kodali-stops 348 koti 17.38471 78.48426 \n",
"3365 shakti-gtfs 1 YANAMPET 17.45620948 78.65957737 \n",
"\n",
" stop_desc \n",
"0 6 Number Towards Tilak Nagar \n",
"1800 \n",
"3365 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read every column as text so ids and coordinates keep their exact form;\n",
"# missing cells become empty strings.\n",
"df = pd.read_csv('stops-geolocated-combined-orig.csv', dtype=str)\n",
"df = df.fillna('')\n",
"print('shape: ' + str(df.shape))\n",
"print('columns: ' + str(df.columns))\n",
"print('\\nSample data:')\n",
"# the first row of each source serves as the sample\n",
"df.drop_duplicates(subset='source')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['source', 'stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'stop_desc'], dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## some data cleaning"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['hyderguda x road ', 'Owaisi Hospital ', 'dilsukh nagar ',\n",
" 'snehapuri colony ', 'survey of india ', 'ngri ', 'tarnaka ',\n",
" 'Koheda X Road ', 'Hafeezpet ', 'koheda x road ', 'hafeezpet ',\n",
" 'huda colony ', 'water Tank ', 'jeedimetla depot ', 'C I S F ',\n",
" 'arts college ', 'Gandhi bavan ', 'rtc colony ', 'Chowdariguda ',\n",
" 'ramnagar e seva ', 'Water Tank ', 'gangaram ',\n",
" 'Chintalakunta Checkpost ', 'baba nagar ', 'SAFILGUDA ',\n",
" 'airport ', 'Abids ', 'Badangpet ', 'nmdc colony ',\n",
" 'Vanasthalipuram ', 'rr nagar ', 'Rr Nagar ', ' Zphs',\n",
" 'shamshabad ', 'rajendranagar depot ', 'INDRA PRASTA '],\n",
" dtype=object)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct stop names that change when stripped, i.e. carry leading/trailing whitespace.\n",
"df.loc[df.stop_name.str.strip().ne(df.stop_name), 'stop_name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Bhulkapur '], dtype=object)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct stop descriptions that change when stripped.\n",
"df.loc[df.stop_desc.str.strip().ne(df.stop_desc), 'stop_desc'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"these have leading or trailing spaces."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Trim surrounding whitespace from the two free-text columns.\n",
"df['stop_name'] = df['stop_name'].str.strip()\n",
"df['stop_desc'] = df['stop_desc'].str.strip()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([], dtype=object)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# stop_id should need no trimming; list any ids that carry surrounding whitespace\n",
"# (shows the offending ids themselves, like the stop_name / stop_desc checks do).\n",
"df[ df.stop_id != df.stop_id.str.strip() ].stop_id.unique()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['TAMIL CHURCH, SECUNDERABAD', 'SUPER BAZAR, MALAKPET',\n",
" 'WOMENS COLLEGE, KOTHI', 'NIZAM COLLEGE, BASHEERBAGH',\n",
" 'REGIMENTAL BAZAR, SECUNDERABAD',\n",
" 'BABU JAGJEEVAN RAM BHAVAN, KOTHAPET',\n",
" 'CHILKALGUDA X ROADS, SECUNDERABAD',\n",
" 'RATHIFILE BUS STATION, SECUNDERABAD', 'DBR MILLS, ELCHIGUDA',\n",
" 'PUTLIBOWLI, KOTHI', 'MODERN BAKERY, UPPAL',\n",
" 'GURUDWARA, SECUNDERABAD', 'HYDERABAD PUBLIC SCHOOL, BEGUMPET',\n",
" 'CENTRAL BUS STATION, GOWLIGUDA', 'STATE ARCHIVES, TARNAKA',\n",
" 'HAKIMPET, SECUNDERABAD', 'SYNDICATE BANK, SHIVAM ROAD',\n",
" 'LADIES HOSTEL, MISRIGUNJ', 'LADIES HOSTEL, ANGRAU',\n",
" 'WEAKER SECTION COLONY, KUNDANPALLY',\n",
" 'WEAKER SECTION COLONY, PEDDA AMBERPET',\n",
" 'OSMANIA MEDICAL COLLEGE, KOTHI', 'PUSHPANJALI COMPLEX, KOTHI',\n",
" 'MAISAMMA TEMPLE, MALLAPUR', 'YASHODA HOSPITAL, SOMAJIGUDA',\n",
" 'BABU JAGJEEVAN RAM COLLEGE, LAKDI-KA-PUL',\n",
" 'POLICE LINES, BEGUMPET', 'NMDC, MASAB TANK'], dtype=object)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# which stop names contain a comma?\n",
"df.loc[df.stop_name.str.contains(','), 'stop_name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# ohh many commas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### > did a basic names cleanup\n",
"Left the commas as-is"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## limiting lat-longs to significant digits\n",
"Tool for measuring distance between a lat-long pair: https://www.movable-type.co.uk/scripts/latlong.html\n",
"\n",
"4 digit precision: Max distance: 13.8 m : too much. \n",
"Screenshot: ![4digit](https://i.imgur.com/G4cwDkz.png)\n",
"\n",
"5 digit precision: Max distance: 1.4 m : better.\n",
"Screenshot: ![5digit](https://i.imgur.com/L6cB6FU.png)\n",
"\n",
"#### > So, let's go with 5 digit precision: ~~Create new columns where~~ lat-longs are rounded to 5 decimal places."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" source | \n",
" stop_id | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" stop_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" gps-logs-ghz | \n",
" 889 | \n",
" 6 NUMBER | \n",
" 17.39180 | \n",
" 78.51082 | \n",
" 6 Number Towards Tilak Nagar | \n",
"
\n",
" \n",
" 1 | \n",
" gps-logs-ghz | \n",
" 887 | \n",
" 6 NUMBER | \n",
" 17.39174 | \n",
" 78.51236 | \n",
" 6 number towards Amberpet | \n",
"
\n",
" \n",
" 2 | \n",
" gps-logs-ghz | \n",
" 2469 | \n",
" 9 NUMBER X ROAD | \n",
" 17.35656 | \n",
" 78.43831 | \n",
" 9number x road towards | \n",
"
\n",
" \n",
" 3 | \n",
" gps-logs-ghz | \n",
" 2468 | \n",
" 9 NUMBER X ROAD | \n",
" 17.35577 | \n",
" 78.43393 | \n",
" 9number X Road | \n",
"
\n",
" \n",
" 4 | \n",
" gps-logs-ghz | \n",
" 1685 | \n",
" A G COLONY X ROAD | \n",
" 17.44416 | \n",
" 78.43300 | \n",
" a g colony x road towards | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source stop_id stop_name stop_lat stop_lon \\\n",
"0 gps-logs-ghz 889 6 NUMBER 17.39180 78.51082 \n",
"1 gps-logs-ghz 887 6 NUMBER 17.39174 78.51236 \n",
"2 gps-logs-ghz 2469 9 NUMBER X ROAD 17.35656 78.43831 \n",
"3 gps-logs-ghz 2468 9 NUMBER X ROAD 17.35577 78.43393 \n",
"4 gps-logs-ghz 1685 A G COLONY X ROAD 17.44416 78.43300 \n",
"\n",
" stop_desc \n",
"0 6 Number Towards Tilak Nagar \n",
"1 6 number towards Amberpet \n",
"2 9number x road towards \n",
"3 9number X Road \n",
"4 a g colony x road towards "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parse the text coordinates and keep 5 decimal places.\n",
"for coord in ('stop_lat', 'stop_lon'):\n",
"    df[coord] = df[coord].map(lambda value: round(float(value), 5))\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Open location code"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" source | \n",
" stop_id | \n",
" stop_name | \n",
" stop_lat | \n",
" stop_lon | \n",
" stop_desc | \n",
" olc8 | \n",
" olc10 | \n",
" olc11 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" gps-logs-ghz | \n",
" 889 | \n",
" 6 NUMBER | \n",
" 17.39180 | \n",
" 78.51082 | \n",
" 6 Number Towards Tilak Nagar | \n",
" 7J9W9GR6+ | \n",
" 7J9W9GR6+P8 | \n",
" 7J9W9GR6+P8G | \n",
"
\n",
" \n",
" 1 | \n",
" gps-logs-ghz | \n",
" 887 | \n",
" 6 NUMBER | \n",
" 17.39174 | \n",
" 78.51236 | \n",
" 6 number towards Amberpet | \n",
" 7J9W9GR6+ | \n",
" 7J9W9GR6+MW | \n",
" 7J9W9GR6+MWX | \n",
"
\n",
" \n",
" 2 | \n",
" gps-logs-ghz | \n",
" 2469 | \n",
" 9 NUMBER X ROAD | \n",
" 17.35656 | \n",
" 78.43831 | \n",
" 9number x road towards | \n",
" 7J9W9C4Q+ | \n",
" 7J9W9C4Q+J8 | \n",
" 7J9W9C4Q+J8F | \n",
"
\n",
" \n",
" 3 | \n",
" gps-logs-ghz | \n",
" 2468 | \n",
" 9 NUMBER X ROAD | \n",
" 17.35577 | \n",
" 78.43393 | \n",
" 9number X Road | \n",
" 7J9W9C4M+ | \n",
" 7J9W9C4M+8H | \n",
" 7J9W9C4M+8H3 | \n",
"
\n",
" \n",
" 4 | \n",
" gps-logs-ghz | \n",
" 1685 | \n",
" A G COLONY X ROAD | \n",
" 17.44416 | \n",
" 78.43300 | \n",
" a g colony x road towards | \n",
" 7J9WCCVM+ | \n",
" 7J9WCCVM+M5 | \n",
" 7J9WCCVM+M59 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source stop_id stop_name stop_lat stop_lon \\\n",
"0 gps-logs-ghz 889 6 NUMBER 17.39180 78.51082 \n",
"1 gps-logs-ghz 887 6 NUMBER 17.39174 78.51236 \n",
"2 gps-logs-ghz 2469 9 NUMBER X ROAD 17.35656 78.43831 \n",
"3 gps-logs-ghz 2468 9 NUMBER X ROAD 17.35577 78.43393 \n",
"4 gps-logs-ghz 1685 A G COLONY X ROAD 17.44416 78.43300 \n",
"\n",
" stop_desc olc8 olc10 olc11 \n",
"0 6 Number Towards Tilak Nagar 7J9W9GR6+ 7J9W9GR6+P8 7J9W9GR6+P8G \n",
"1 6 number towards Amberpet 7J9W9GR6+ 7J9W9GR6+MW 7J9W9GR6+MWX \n",
"2 9number x road towards 7J9W9C4Q+ 7J9W9C4Q+J8 7J9W9C4Q+J8F \n",
"3 9number X Road 7J9W9C4M+ 7J9W9C4M+8H 7J9W9C4M+8H3 \n",
"4 a g colony x road towards 7J9WCCVM+ 7J9WCCVM+M5 7J9WCCVM+M59 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Open Location Code of every stop, encoded with code lengths 8, 10 and 11.\n",
"for code_length in (8, 10, 11):\n",
"    df['olc{}'.format(code_length)] = df.apply(\n",
"        lambda row: olc.encode(float(row['stop_lat']), float(row['stop_lon']), code_length),\n",
"        axis=1)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Grouping the data together by the rounded lat and long.~~olc10~~"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def cleanList(arr):\n",
" # drop blanks\n",
" arr1 = arr[arr != ''].str.lower()\n",
" \n",
" # if nothing then return blank\n",
" if not len(arr1): return []\n",
" \n",
" # remove duplicates\n",
" arr2 = list(set(arr1))\n",
" arr2.sort()\n",
" \n",
" if len(arr2)>2: print(arr2)\n",
" \n",
" # return list(ser2) if len(ser2)>1 else list(ser2)[0]\n",
" return arr2\n",
"\n",
"def grouper1(x):\n",
" sources = cleanList(x.source)\n",
" stop_ids = cleanList(x.stop_id)\n",
" stop_names = cleanList(x.stop_name)\n",
" stop_descs = cleanList(x.stop_desc)\n",
" \n",
" collapsedRow = OrderedDict({\n",
" 'source': '|'.join(sources),\n",
" 'stop_id': '|'.join(stop_ids),\n",
" 'stop_name': '|'.join(stop_names),\n",
" 'stop_desc': '|'.join(stop_descs),\n",
" \n",
" 'num_source': len(sources),\n",
" 'num_stop_id': len(stop_ids),\n",
" 'num_stop_name': len(stop_names),\n",
" 'num_stop_desc': len(stop_descs) # if isinstance(stop_descs,list) else 1,\n",
" })\n",
" return pd.Series(collapsedRow)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" source | \n",
" stop_id | \n",
" stop_name | \n",
" stop_desc | \n",
" num_source | \n",
" num_stop_id | \n",
" num_stop_name | \n",
" num_stop_desc | \n",
"
\n",
" \n",
" stop_lat | \n",
" stop_lon | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 16.95312 | \n",
" 78.34685 | \n",
" shakti-gtfs | \n",
" 1592 | \n",
" keshampet | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 16.98563 | \n",
" 78.36715 | \n",
" shakti-gtfs | \n",
" 1658 | \n",
" kothapet village | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 17.00153 | \n",
" 78.69018 | \n",
" gps-logs-ghz | \n",
" 8101 | \n",
" takkalapally | \n",
" takkalapally | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 17.00934 | \n",
" 78.68671 | \n",
" gps-logs-ghz | \n",
" 8100 | \n",
" chintapatla | \n",
" chintapatla | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 17.01231 | \n",
" 78.38711 | \n",
" shakti-gtfs | \n",
" 968 | \n",
" santapur | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source stop_id stop_name stop_desc \\\n",
"stop_lat stop_lon \n",
"16.95312 78.34685 shakti-gtfs 1592 keshampet \n",
"16.98563 78.36715 shakti-gtfs 1658 kothapet village \n",
"17.00153 78.69018 gps-logs-ghz 8101 takkalapally takkalapally \n",
"17.00934 78.68671 gps-logs-ghz 8100 chintapatla chintapatla \n",
"17.01231 78.38711 shakti-gtfs 968 santapur \n",
"\n",
" num_source num_stop_id num_stop_name num_stop_desc \n",
"stop_lat stop_lon \n",
"16.95312 78.34685 1 1 1 0 \n",
"16.98563 78.36715 1 1 1 0 \n",
"17.00153 78.69018 1 1 1 1 \n",
"17.00934 78.68671 1 1 1 1 \n",
"17.01231 78.38711 1 1 1 0 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One summary row per distinct (stop_lat, stop_lon) pair.\n",
"location_groups = df.groupby(by=['stop_lat', 'stop_lon'])\n",
"df1 = location_groups.apply(grouper1)\n",
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(1502, 8)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) of the locations reported by more than one source\n",
"df1.loc[df1.num_source.gt(1)].shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"stop_lat stop_lon\n",
"17.13406 78.43328 gps-logs-ghz|kodali-stops\n",
"17.13454 78.43347 gps-logs-ghz|kodali-stops\n",
"17.13663 78.43695 gps-logs-ghz|kodali-stops\n",
"17.13699 78.43701 gps-logs-ghz|kodali-stops\n",
"17.15359 78.44789 gps-logs-ghz|kodali-stops\n",
"Name: source, dtype: object"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# which sources share a location? (first few)\n",
"df1.loc[df1.num_source.gt(1), 'source'].head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, 8)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Is any location shared by more than two sources? Zero rows means no.\n",
"df1.loc[df1.num_source.gt(2)].shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### > 1502 entries found common"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### where two or more different stop_ids have been grouped"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (10, 8)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" stop_id | \n",
" stop_name | \n",
" stop_desc | \n",
" source | \n",
"
\n",
" \n",
" stop_lat | \n",
" stop_lon | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 17.25164 | \n",
" 78.42977 | \n",
" 1786|1826 | \n",
" airport | \n",
" airport|airport towards | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.33010 | \n",
" 78.47073 | \n",
" 3726|3730 | \n",
" farooq nagar bus station | \n",
" farooq nagar|farooq nagar a | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.33039 | \n",
" 78.57514 | \n",
" 3010|3013 | \n",
" rythu bazar | \n",
" rythu bazar|rythu bazar towards | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.35008 | \n",
" 78.54759 | \n",
" 1852|442 | \n",
" lb nagar | \n",
" lb nagar to|lb nagar towards ntr nagar | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.39643 | \n",
" 78.47094 | \n",
" 358|371 | \n",
" public gardens | \n",
" public gardens to words lakadi kapool|public ... | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.39883 | \n",
" 78.49402 | \n",
" 355|362 | \n",
" narayanaguda | \n",
" narayanguda towards koti|narayanguda bloodban... | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.40201 | \n",
" 78.49511 | \n",
" 354|361 | \n",
" chikkadapalli | \n",
" chikkadapalli to words narayanguda|chikkadapal... | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.43340 | \n",
" 78.50695 | \n",
" 2202|2204 | \n",
" chilkalguda|singareni colony | \n",
" chilkalguda towards alagadda baavi|singareni c... | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.44627 | \n",
" 78.53569 | \n",
" 2017|2018 | \n",
" sairam theatre | \n",
" sairam theatre|sairam theatre to | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
" 17.57543 | \n",
" 78.42302 | \n",
" 1461|1462 | \n",
" gandimaisamma | \n",
" gandimaisamma|gandimaisamma towards baharadura... | \n",
" gps-logs-ghz|kodali-stops | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" stop_id stop_name \\\n",
"stop_lat stop_lon \n",
"17.25164 78.42977 1786|1826 airport \n",
"17.33010 78.47073 3726|3730 farooq nagar bus station \n",
"17.33039 78.57514 3010|3013 rythu bazar \n",
"17.35008 78.54759 1852|442 lb nagar \n",
"17.39643 78.47094 358|371 public gardens \n",
"17.39883 78.49402 355|362 narayanaguda \n",
"17.40201 78.49511 354|361 chikkadapalli \n",
"17.43340 78.50695 2202|2204 chilkalguda|singareni colony \n",
"17.44627 78.53569 2017|2018 sairam theatre \n",
"17.57543 78.42302 1461|1462 gandimaisamma \n",
"\n",
" stop_desc \\\n",
"stop_lat stop_lon \n",
"17.25164 78.42977 airport|airport towards \n",
"17.33010 78.47073 farooq nagar|farooq nagar a \n",
"17.33039 78.57514 rythu bazar|rythu bazar towards \n",
"17.35008 78.54759 lb nagar to|lb nagar towards ntr nagar \n",
"17.39643 78.47094 public gardens to words lakadi kapool|public ... \n",
"17.39883 78.49402 narayanguda towards koti|narayanguda bloodban... \n",
"17.40201 78.49511 chikkadapalli to words narayanguda|chikkadapal... \n",
"17.43340 78.50695 chilkalguda towards alagadda baavi|singareni c... \n",
"17.44627 78.53569 sairam theatre|sairam theatre to \n",
"17.57543 78.42302 gandimaisamma|gandimaisamma towards baharadura... \n",
"\n",
" source \n",
"stop_lat stop_lon \n",
"17.25164 78.42977 gps-logs-ghz|kodali-stops \n",
"17.33010 78.47073 gps-logs-ghz|kodali-stops \n",
"17.33039 78.57514 gps-logs-ghz|kodali-stops \n",
"17.35008 78.54759 gps-logs-ghz|kodali-stops \n",
"17.39643 78.47094 gps-logs-ghz|kodali-stops \n",
"17.39883 78.49402 gps-logs-ghz|kodali-stops \n",
"17.40201 78.49511 gps-logs-ghz|kodali-stops \n",
"17.43340 78.50695 gps-logs-ghz|kodali-stops \n",
"17.44627 78.53569 gps-logs-ghz|kodali-stops \n",
"17.57543 78.42302 gps-logs-ghz|kodali-stops "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Locations whose group holds more than one distinct stop_id.\n",
"df2 = df1.loc[df1.num_stop_id.gt(1)]\n",
"print('shape: ' + str(df2.shape))\n",
"df2.loc[:, ['stop_id', 'stop_name', 'stop_desc', 'source']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### > 10 pairings of stop_id's have happened\n",
"In these, the lat-long values were identical."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### where same stop_id but different stop names"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Locations with a single stop_id but more than one distinct stop name.\n",
"df3 = df1.loc[df1.num_stop_name.gt(1) & df1.num_stop_id.eq(1)]\n",
"# to inspect the full rows: df3[['source','stop_id','stop_name','stop_desc']]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(20, 8)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3.shape"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"stop_lat stop_lon\n",
"17.35085 78.35804 golden farm|golden form\n",
"17.35111 78.35807 golden farm|golden form\n",
"17.39441 78.49884 barkathapura|barkathpura\n",
"17.39454 78.49849 barkathapura|barkathpura\n",
"17.39593 78.49717 barkathapura|barkathpura\n",
"17.42582 78.53287 tarnaka|tarnaka u\n",
"17.44053 78.49793 clock tower|clocktower\n",
"17.44091 78.36000 indira nagar|indiranagar\n",
"17.44394 78.47126 prakash nagar|prakashnagar\n",
"17.44720 78.37793 cyber pearl|cyber peral\n",
"17.46724 78.44535 bala nagar|balanagar\n",
"17.46753 78.44443 bala nagar towards kp|balanagar\n",
"17.48018 78.58178 nagarjuna nagar|nagarjuna nagar colony\n",
"17.50233 78.31185 jyothi nagar|jyothi nagar lp\n",
"17.50482 78.30836 ashok nagar|ashok nagar lp\n",
"17.52145 78.51738 cantonment hospital|contonment hospital\n",
"17.52168 78.51792 cantonment hospital|contonment hospital\n",
"17.54257 78.53191 c i s f|cisf\n",
"17.54280 78.53160 c i s f|cisf\n",
"17.54335 78.37269 indira nagar|indiranagar\n",
"Name: stop_name, dtype: object"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3.stop_name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### > 20 entries where the stop_id was the same across datasets but the names have spelling differences"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check: have any lat-longs been grouped or not?"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\na = df1.stop_lat_orig.apply(lambda x: isinstance(x,list))\\ndf1[a]\\n'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"a = df1.stop_lat_orig.apply(lambda x: isinstance(x,list))\n",
"df1[a]\n",
"'''"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### > We actually didn't really group any \"slightly different\" lat-longs here.\n",
"They were all either distinct after 5-digit rounding, or were identical already. \n",
"(This was checked in a previous run; the notebook was then changed to keep the stops as they are, with the rounded values in stop_lat / stop_lon, instead of making new columns.)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## open location code"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3564, 10)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" stop_lat | \n",
" stop_lon | \n",
" source | \n",
" stop_id | \n",
" stop_name | \n",
" stop_desc | \n",
" num_source | \n",
" num_stop_id | \n",
" num_stop_name | \n",
" num_stop_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 16.95312 | \n",
" 78.34685 | \n",
" shakti-gtfs | \n",
" 1592 | \n",
" keshampet | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 16.98563 | \n",
" 78.36715 | \n",
" shakti-gtfs | \n",
" 1658 | \n",
" kothapet village | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 17.00153 | \n",
" 78.69018 | \n",
" gps-logs-ghz | \n",
" 8101 | \n",
" takkalapally | \n",
" takkalapally | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 17.00934 | \n",
" 78.68671 | \n",
" gps-logs-ghz | \n",
" 8100 | \n",
" chintapatla | \n",
" chintapatla | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 17.01231 | \n",
" 78.38711 | \n",
" shakti-gtfs | \n",
" 968 | \n",
" santapur | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" stop_lat stop_lon source stop_id stop_name stop_desc \\\n",
"0 16.95312 78.34685 shakti-gtfs 1592 keshampet \n",
"1 16.98563 78.36715 shakti-gtfs 1658 kothapet village \n",
"2 17.00153 78.69018 gps-logs-ghz 8101 takkalapally takkalapally \n",
"3 17.00934 78.68671 gps-logs-ghz 8100 chintapatla chintapatla \n",
"4 17.01231 78.38711 shakti-gtfs 968 santapur \n",
"\n",
" num_source num_stop_id num_stop_name num_stop_desc \n",
"0 1 1 1 0 \n",
"1 1 1 1 0 \n",
"2 1 1 1 1 \n",
"3 1 1 1 1 \n",
"4 1 1 1 0 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lets try open location code\n",
"df4 = df1.reset_index()\n",
"print(df4.shape)\n",
"df4.head()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" stop_lat | \n",
" stop_lon | \n",
" source | \n",
" stop_id | \n",
" stop_name | \n",
" stop_desc | \n",
" num_source | \n",
" num_stop_id | \n",
" num_stop_name | \n",
" num_stop_desc | \n",
" olc8 | \n",
" olc10 | \n",
" olc11 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 16.95312 | \n",
" 78.34685 | \n",
" shakti-gtfs | \n",
" 1592 | \n",
" keshampet | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 7J8WX83W+ | \n",
" 7J8WX83W+6P | \n",
" 7J8WX83W+6PX | \n",
"
\n",
" \n",
" 1 | \n",
" 16.98563 | \n",
" 78.36715 | \n",
" shakti-gtfs | \n",
" 1658 | \n",
" kothapet village | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 7J8WX9P8+ | \n",
" 7J8WX9P8+7V | \n",
" 7J8WX9P8+7V2 | \n",
"
\n",
" \n",
" 2 | \n",
" 17.00153 | \n",
" 78.69018 | \n",
" gps-logs-ghz | \n",
" 8101 | \n",
" takkalapally | \n",
" takkalapally | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 7J9W2M2R+ | \n",
" 7J9W2M2R+J3 | \n",
" 7J9W2M2R+J37 | \n",
"
\n",
" \n",
" 3 | \n",
" 17.00934 | \n",
" 78.68671 | \n",
" gps-logs-ghz | \n",
" 8100 | \n",
" chintapatla | \n",
" chintapatla | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 7J9W2M5P+ | \n",
" 7J9W2M5P+PM | \n",
" 7J9W2M5P+PMP | \n",
"
\n",
" \n",
" 4 | \n",
" 17.01231 | \n",
" 78.38711 | \n",
" shakti-gtfs | \n",
" 968 | \n",
" santapur | \n",
" | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 7J9W296P+ | \n",
" 7J9W296P+WR | \n",
" 7J9W296P+WRH | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" stop_lat stop_lon source stop_id stop_name stop_desc \\\n",
"0 16.95312 78.34685 shakti-gtfs 1592 keshampet \n",
"1 16.98563 78.36715 shakti-gtfs 1658 kothapet village \n",
"2 17.00153 78.69018 gps-logs-ghz 8101 takkalapally takkalapally \n",
"3 17.00934 78.68671 gps-logs-ghz 8100 chintapatla chintapatla \n",
"4 17.01231 78.38711 shakti-gtfs 968 santapur \n",
"\n",
" num_source num_stop_id num_stop_name num_stop_desc olc8 \\\n",
"0 1 1 1 0 7J8WX83W+ \n",
"1 1 1 1 0 7J8WX9P8+ \n",
"2 1 1 1 1 7J9W2M2R+ \n",
"3 1 1 1 1 7J9W2M5P+ \n",
"4 1 1 1 0 7J9W296P+ \n",
"\n",
" olc10 olc11 \n",
"0 7J8WX83W+6P 7J8WX83W+6PX \n",
"1 7J8WX9P8+7V 7J8WX9P8+7V2 \n",
"2 7J9W2M2R+J3 7J9W2M2R+J37 \n",
"3 7J9W2M5P+PM 7J9W2M5P+PMP \n",
"4 7J9W296P+WR 7J9W296P+WRH "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Open Location Code of each collapsed stop, encoded with code lengths 8, 10 and 11.\n",
"for code_length in (8, 10, 11):\n",
"    df4['olc{}'.format(code_length)] = df4.apply(\n",
"        lambda row: olc.encode(row['stop_lat'], row['stop_lon'], code_length),\n",
"        axis=1)\n",
"df4.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## sort and rearrange"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" stop_id | \n",
" stop_name | \n",
" stop_desc | \n",
" source | \n",
" stop_lat | \n",
" stop_lon | \n",
" olc8 | \n",
" olc10 | \n",
" olc11 | \n",
" num_source | \n",
" num_stop_id | \n",
" num_stop_name | \n",
" num_stop_desc | \n",
"
\n",
" \n",
" \n",
" \n",
" 1313 | \n",
" 887 | \n",
" 6 number | \n",
" 6 number towards amberpet | \n",
" gps-logs-ghz|kodali-stops | \n",
" 17.39174 | \n",
" 78.51236 | \n",
" 7J9W9GR6+ | \n",
" 7J9W9GR6+MW | \n",
" 7J9W9GR6+MWX | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 1315 | \n",
" 889 | \n",
" 6 number | \n",
" 6 number towards tilak nagar | \n",
" gps-logs-ghz|kodali-stops | \n",
" 17.39180 | \n",
" 78.51082 | \n",
" 7J9W9GR6+ | \n",
" 7J9W9GR6+P8 | \n",
" 7J9W9GR6+P8G | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 919 | \n",
" 2468 | \n",
" 9 number x road | \n",
" 9number x road | \n",
" gps-logs-ghz|kodali-stops | \n",
" 17.35577 | \n",
" 78.43393 | \n",
" 7J9W9C4M+ | \n",
" 7J9W9C4M+8H | \n",
" 7J9W9C4M+8H3 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 924 | \n",
" 2469 | \n",
" 9 number x road | \n",
" 9number x road towards | \n",
" gps-logs-ghz|kodali-stops | \n",
" 17.35656 | \n",
" 78.43831 | \n",
" 7J9W9C4Q+ | \n",
" 7J9W9C4Q+J8 | \n",
" 7J9W9C4Q+J8F | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2203 | \n",
" 1685 | \n",
" a g colony x road | \n",
" a g colony x road towards | \n",
" gps-logs-ghz | \n",
" 17.44416 | \n",
" 78.43300 | \n",
" 7J9WCCVM+ | \n",
" 7J9WCCVM+M5 | \n",
" 7J9WCCVM+M59 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2664 | \n",
" 1013 | \n",
" a s rao nagar | \n",
" a s rao nagar towards radhika | \n",
" gps-logs-ghz|kodali-stops | \n",
" 17.47951 | \n",
" 78.55832 | \n",
" 7J9WFHH5+ | \n",
" 7J9WFHH5+R8 | \n",
" 7J9WFHH5+R84 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2672 | \n",
" 1029 | \n",
" a s rao nagar | \n",
" a s rao nagar towards rukmini puram | \n",
" gps-logs-ghz|kodali-stops | \n",
" 17.48003 | \n",
" 78.55683 | \n",
" 7J9WFHJ4+ | \n",
" 7J9WFHJ4+2P | \n",
" 7J9WFHJ4+2P8 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 1476 | \n",
" 781 | \n",
" aaykar bhavan | \n",
" | \n",
" shakti-gtfs | \n",
" 17.40118 | \n",
" 78.47315 | \n",
" 7J9WCF2F+ | \n",
" 7J9WCF2F+F7 | \n",
" 7J9WCF2F+F7C | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 378 | \n",
" 3567 | \n",
" abdullapur met | \n",
" abdullapur met | \n",
" gps-logs-ghz | \n",
" 17.31004 | \n",
" 78.68643 | \n",
" 7J9W8M6P+ | \n",
" 7J9W8M6P+2H | \n",
" 7J9W8M6P+2H7 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 377 | \n",
" 3594 | \n",
" abdullapur met | \n",
" abdullapur met | \n",
" gps-logs-ghz | \n",
" 17.30983 | \n",
" 78.68647 | \n",
" 7J9W8M5P+ | \n",
" 7J9W8M5P+WH | \n",
" 7J9W8M5P+WHQ | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" stop_id stop_name stop_desc \\\n",
"1313 887 6 number 6 number towards amberpet \n",
"1315 889 6 number 6 number towards tilak nagar \n",
"919 2468 9 number x road 9number x road \n",
"924 2469 9 number x road 9number x road towards \n",
"2203 1685 a g colony x road a g colony x road towards \n",
"2664 1013 a s rao nagar a s rao nagar towards radhika \n",
"2672 1029 a s rao nagar a s rao nagar towards rukmini puram \n",
"1476 781 aaykar bhavan \n",
"378 3567 abdullapur met abdullapur met \n",
"377 3594 abdullapur met abdullapur met \n",
"\n",
" source stop_lat stop_lon olc8 olc10 \\\n",
"1313 gps-logs-ghz|kodali-stops 17.39174 78.51236 7J9W9GR6+ 7J9W9GR6+MW \n",
"1315 gps-logs-ghz|kodali-stops 17.39180 78.51082 7J9W9GR6+ 7J9W9GR6+P8 \n",
"919 gps-logs-ghz|kodali-stops 17.35577 78.43393 7J9W9C4M+ 7J9W9C4M+8H \n",
"924 gps-logs-ghz|kodali-stops 17.35656 78.43831 7J9W9C4Q+ 7J9W9C4Q+J8 \n",
"2203 gps-logs-ghz 17.44416 78.43300 7J9WCCVM+ 7J9WCCVM+M5 \n",
"2664 gps-logs-ghz|kodali-stops 17.47951 78.55832 7J9WFHH5+ 7J9WFHH5+R8 \n",
"2672 gps-logs-ghz|kodali-stops 17.48003 78.55683 7J9WFHJ4+ 7J9WFHJ4+2P \n",
"1476 shakti-gtfs 17.40118 78.47315 7J9WCF2F+ 7J9WCF2F+F7 \n",
"378 gps-logs-ghz 17.31004 78.68643 7J9W8M6P+ 7J9W8M6P+2H \n",
"377 gps-logs-ghz 17.30983 78.68647 7J9W8M5P+ 7J9W8M5P+WH \n",
"\n",
" olc11 num_source num_stop_id num_stop_name num_stop_desc \n",
"1313 7J9W9GR6+MWX 2 1 1 1 \n",
"1315 7J9W9GR6+P8G 2 1 1 1 \n",
"919 7J9W9C4M+8H3 2 1 1 1 \n",
"924 7J9W9C4Q+J8F 2 1 1 1 \n",
"2203 7J9WCCVM+M59 1 1 1 1 \n",
"2664 7J9WFHH5+R84 2 1 1 1 \n",
"2672 7J9WFHJ4+2P8 2 1 1 1 \n",
"1476 7J9WCF2F+F7C 1 1 1 0 \n",
"378 7J9W8M6P+2H7 1 1 1 1 \n",
"377 7J9W8M5P+WHQ 1 1 1 1 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Identifying columns first, then coordinates and location codes, then the per-group counts.\n",
"export_columns = [\n",
"    'stop_id', 'stop_name', 'stop_desc', 'source',\n",
"    'stop_lat', 'stop_lon',\n",
"    'olc8', 'olc10', 'olc11',\n",
"    'num_source', 'num_stop_id', 'num_stop_name', 'num_stop_desc',\n",
"]\n",
"# Order rows by name, then source, then stop_id (stop_id is text, so it sorts lexicographically).\n",
"df5 = df4[export_columns].sort_values(by=['stop_name', 'source', 'stop_id'])\n",
"df5.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# lets export this\n",
"df5.to_csv('stops-geolocated-combined-1.csv',index_label='sr')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}