Author: Adam Pioterek <adam.pioterek@protonmail.ch>
database directory
.gitignore | 9 ++------- database/cron | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ database/db.schema | 30 ++++++++++++++++++++++++++++++ {research => database}/scraper.py | 0
diff --git a/.gitignore b/.gitignore index 03e31c52b899a1f7d7fae329bddd1c922968b798..f59b45660e95a0a997e5ddfb6988bdd928658004 100644 --- a/.gitignore +++ b/.gitignore @@ -45,10 +45,5 @@ /captures .externalNativeBuild app/release/ -research/cron -research/peka-vm-api.md -research/svg/ -research/timetable/ -research/timetable_new/ -research/timetable.db -research/timetable.db-journal +research/ +database/timetable* diff --git a/database/cron b/database/cron new file mode 100644 index 0000000000000000000000000000000000000000..36b7f43366ebdccab4f9f4e6d8221507fca495de --- /dev/null +++ b/database/cron @@ -0,0 +1,48 @@ +#!/usr/bin/bash + +d=$(date -Iminutes) + +cd /home/qeebmisk + +set +e +/usr/bin/python3.6 scraper.py >new_meta 2> "scraper_$d.log" +result=$? +if [ $result -eq 0 ] +then + /usr/bin/sqlite3 timetable.db 'pragma integrity_check;' + if [ $? -eq 0 ] + then + trap except ERR + + echo 'making metadata' >> "scraper_$d.log" + mv new_meta timetable.db.meta + sha512sum timetable.db | cut -d ' ' -f 1 >>timetable.db.meta + ls -l timetable.db | cut -d ' ' -f 5 >>timetable.db.meta + echo '1.1.0' >>timetable.db.meta + echo 'timetable.db.xz' >>timetable.db.meta + + xz -z -k timetable.db + + echo 'moving' >> "scraper_$d.log" + mv timetable.db.xz public_html/w/data/media/programmes/bimba/timetable.db.xz + mv timetable.db.meta public_html/w/data/media/programmes/bimba/timetable.db.meta + + echo 'success' >> "scraper_$d.log" + else + echo 'db integrity check failed' >> "scraper_$d.log" + rm timetable.db + rm new_meta + fi +elif [ $result -eq 48 ] +then + echo 'db is still valid' >> "scraper_$d.log" + rm new_meta +else + echo 'scraper failed' >> "scraper_$d.log" + rm timetable.db + rm new_meta +fi + +function except { + (rm /home/qeebmisk/timetable.db; rm /home/qeebmisk/new_meta) || : +} diff --git a/database/db.schema b/database/db.schema new file mode 100644 index 0000000000000000000000000000000000000000..2f7de0bddd67bb1dfb80e9e8504c1b1dcf385eb6 --- /dev/null +++ 
b/database/db.schema @@ -0,0 +1,30 @@ +metadata +-------- +#key +value + +nodes stops +------- --------- +#symbol <____ #id <_________ +name |___ symbol | + number | + lat | + lon | + headsigns | + | +lines timetables | +------ ---------- | +#id <______ #id <________/|\__ +number | stop_id ______| | + |_____ line_id | + headsign | + | + departures | + ---------- | + #id | + timetable_id _____| + hour + minute + mode + lowFloor + modification diff --git a/database/scraper.py b/database/scraper.py new file mode 100755 index 0000000000000000000000000000000000000000..7109b9d730a39c50dfc4babceef440dcbcb85020 --- /dev/null +++ b/database/scraper.py @@ -0,0 +1,237 @@ +#!/bin/python +""" +js interface: http://www.ztm.poznan.pl/themes/ztm/dist/js/app.js +nodes: http://www.ztm.poznan.pl/goeuropa-api/all-nodes +stops in node: http://www.ztm.poznan.pl/goeuropa-api/node_stops/{node:symbol} +stops: http://www.ztm.poznan.pl/goeuropa-api/stops-nodes +bike stations: http://www.ztm.poznan.pl/goeuropa-api/bike-stations + +""" +import json +import os +import re +import sqlite3 +import sys +import requests +from bs4 import BeautifulSoup + +class TimetableDownloader: + """ + downloader class + """ + def __init__(self, verbose): + self.session = requests.session() + self.verbose = verbose + + + def __get_validity(self): + """ + get timetable validity + """ + index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/index') + option = re.search('<option value="[0-9]{8}" selected', index.text).group() + return option.split('"')[1] + + + def __get_nodes(self): + """ + get nodes + """ + index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/all-nodes') + return [(stop['symbol'], stop['name']) for stop in json.loads(index.text)] + + + def __get_stops(self, node): + """ + get stops + """ + index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/node_stops/{}'.format(node)) + stops = [] + for stop in json.loads(index.text): + stop_id = stop['stop']['id'] + number = re.findall("\\d+", 
stop['stop']['symbol'])[0] + lat = stop['stop']['lat'] + lon = stop['stop']['lon'] + directions = ', '.join(['{} → {}'.format(transfer['name'], transfer['headsign']) + for transfer in stop['transfers']]) + stops.append((stop_id, node, number, lat, lon, directions)) + return stops + + + def __get_lines(self): + """ + get lines + """ + index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/index') + soup = BeautifulSoup(index.text, 'html.parser') + + lines = {line['data-lineid']: line.text for line in + soup.findAll(attrs={'class': re.compile(r'.*\blineNo-bt\b.*')})} + + return lines + + + def __get_route(self, line_id): + """ + get routes + """ + index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/line-info/{}'.format(line_id)) + soup = BeautifulSoup(index.text, 'html.parser') + directions = soup.findAll(attrs={'class': re.compile(r'.*\baccordion-item\b.*')}) + routes = {} + for direction in directions: + direction_id = direction['data-directionid'] + route = [{'id': stop.find('a')['data-stopid'], 'name': stop['data-name'], + 'onDemand': re.search('stop-onDemand', str(stop['class'])) != None} + for stop in direction.findAll(attrs={'class': re.compile(r'.*\bstop-itm\b.*')})] + routes[direction_id] = route + return routes + + + def __get_stop_times(self, stop_id, line_id, direction_id): + """ + get timetable + """ + index = self.__post('https://www.ztm.poznan.pl/goeuropa-api/stop-info/{}/{}'. 
+ format(stop_id, line_id), {'directionId': direction_id}) + soup = BeautifulSoup(index.text, 'html.parser') + legends = {} + for row in soup.find(attrs={'class': re.compile(r'.*\blegend-box\b.*')}).findAll('li'): + row = row.text.split('-') + row[0] = row[0].rstrip() + row[1] = row[1].lstrip() + if row[0] != '_': + legends[row[0]] = '-'.join(row[1:]) + schedules = {} + for mode in soup.findAll(attrs={'class': re.compile(r'.*\bmode-tab\b.*')}): + mode_name = mode['data-mode'] + schedule = {row.find('th').text: [ + {'time': minute.text, 'lowFloor': re.search('n-line', str(minute['class'])) != None} + for minute in row.findAll('a')] + for row in mode.find(attrs={'class': re.compile(r'.*\bscheduler-hours\b.*')}). + findAll('tr')} + schedule_2 = {hour: times for hour, times in schedule.items() if times != []} + schedule = [] + for hour, deps in schedule_2.items(): + for dep in deps: + schedule.append((hour, *self.__describe(dep['time'], legends), dep['lowFloor'])) + schedules[mode_name] = schedule + + return schedules + + + @staticmethod + def __describe(dep_time, legend): + """ + describe departure + """ + desc = [] + while re.match('^\\d+$', dep_time) is None: + try: + if dep_time[-1] != ',': + desc.append(legend[dep_time[-1]]) + except KeyError: + pass + dep_time = dep_time[:-1] + return (int(dep_time), '; '.join(desc)) + + + def __get(self, url): + try: + return self.session.get(url, verify='bundle.pem') + except: + self.session = requests.session() + return self.session.get(url, verify='bundle.pem') + + + def __post(self, url, data): + try: + return self.session.post(url, data=data, verify='bundle.pem') + except: + self.session = requests.session() + return self.session.post(url, data=data, verify='bundle.pem') + + + def download(self): + """ + main function + """ + if os.path.exists('timetable.db'): + connection = sqlite3.connect('timetable.db') + cursor = connection.cursor() + cursor.execute("select value from metadata where key = 'validFrom'") + 
current_valid_from = cursor.fetchone()[0] + cursor.close() + connection.close() + if self.__get_validity() <= current_valid_from: + return 304 + else: + os.remove('timetable.db') + + with sqlite3.connect('timetable.db') as connection: + try: + cursor = connection.cursor() + cursor.execute('create table metadata(key TEXT PRIMARY KEY, value TEXT)') + cursor.execute('create table nodes(symbol TEXT PRIMARY KEY, name TEXT)') + cursor.execute('create table stops(id TEXT PRIMARY KEY, symbol TEXT \ + references node(symbol), number TEXT, lat REAL, lon REAL, \ + headsigns TEXT)') + cursor.execute('create table lines(id TEXT PRIMARY KEY, number TEXT)') + cursor.execute('create table timetables(id TEXT PRIMARY KEY, stop_id TEXT references \ + stop(id), line_id TEXT references line(id), headsign TEXT)') + cursor.execute('create table departures(id INTEGER PRIMARY KEY, \ + timetable_id TEXT references timetable(id), \ + hour INTEGER, minute INTEGER, mode TEXT, \ + lowFloor INTEGER, modification TEXT)') + + validity = self.__get_validity() + print(validity) + sys.stdout.flush() + cursor.execute("insert into metadata values('validFrom', ?)", (validity,)) + nodes = self.__get_nodes() + cursor.executemany('insert into nodes values(?, ?)', nodes) + node_i = 1 + for symbol, _ in nodes: + if self.verbose: + print('node {}'.format(node_i)) + stops = self.__get_stops(symbol) + cursor.executemany('insert into stops values(?, ?, ?, ?, ?, ?)', stops) + node_i += 1 + lines = self.__get_lines() + cursor.executemany('insert into lines values(?, ?)', lines.items()) + + timetable_id = 1 + line_i = 1 + for line_id, _ in lines.items(): + route = self.__get_route(line_id) + route_i = 1 + for direction, stops in route.items(): + stop_i = 1 + for stop in stops: + if self.verbose: + print("stop {} in route {} in line {}".format(stop_i, route_i, line_i)) + timetables = self.__get_stop_times(stop['id'], line_id, direction) + cursor.execute('insert into timetables values(?, ?, ?, ?)', + (timetable_id, 
stop['id'], line_id, stops[-1]['name'])) + for mode, times in timetables.items(): + cursor.executemany('insert into departures values(null, ?, ?, ?, ?, ?, \ + ?)', [(timetable_id, hour, minute, mode, lowfloor, desc) + for hour, minute, desc, lowfloor in times]) + stop_i += 1 + timetable_id += 1 + route_i += 1 + line_i += 1 + except KeyboardInterrupt: + return 404 + return 0 + + +if __name__ == '__main__': + verbose = False + try: + if sys.argv[1] == '-v': + verbose = True + except IndexError: + pass + downloader = TimetableDownloader(verbose) + exit(downloader.download()) diff --git a/research/scraper.py b/research/scraper.py deleted file mode 100755 index 7109b9d730a39c50dfc4babceef440dcbcb85020..0000000000000000000000000000000000000000 --- a/research/scraper.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/bin/python -""" -js interface: http://www.ztm.poznan.pl/themes/ztm/dist/js/app.js -nodes: http://www.ztm.poznan.pl/goeuropa-api/all-nodes -stops in node: http://www.ztm.poznan.pl/goeuropa-api/node_stops/{node:symbol} -stops: http://www.ztm.poznan.pl/goeuropa-api/stops-nodes -bike stations: http://www.ztm.poznan.pl/goeuropa-api/bike-stations - -""" -import json -import os -import re -import sqlite3 -import sys -import requests -from bs4 import BeautifulSoup - -class TimetableDownloader: - """ - downloader class - """ - def __init__(self, verbose): - self.session = requests.session() - self.verbose = verbose - - - def __get_validity(self): - """ - get timetable validity - """ - index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/index') - option = re.search('<option value="[0-9]{8}" selected', index.text).group() - return option.split('"')[1] - - - def __get_nodes(self): - """ - get nodes - """ - index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/all-nodes') - return [(stop['symbol'], stop['name']) for stop in json.loads(index.text)] - - - def __get_stops(self, node): - """ - get stops - """ - index = 
self.__get('https://www.ztm.poznan.pl/goeuropa-api/node_stops/{}'.format(node)) - stops = [] - for stop in json.loads(index.text): - stop_id = stop['stop']['id'] - number = re.findall("\\d+", stop['stop']['symbol'])[0] - lat = stop['stop']['lat'] - lon = stop['stop']['lon'] - directions = ', '.join(['{} → {}'.format(transfer['name'], transfer['headsign']) - for transfer in stop['transfers']]) - stops.append((stop_id, node, number, lat, lon, directions)) - return stops - - - def __get_lines(self): - """ - get lines - """ - index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/index') - soup = BeautifulSoup(index.text, 'html.parser') - - lines = {line['data-lineid']: line.text for line in - soup.findAll(attrs={'class': re.compile(r'.*\blineNo-bt\b.*')})} - - return lines - - - def __get_route(self, line_id): - """ - get routes - """ - index = self.__get('https://www.ztm.poznan.pl/goeuropa-api/line-info/{}'.format(line_id)) - soup = BeautifulSoup(index.text, 'html.parser') - directions = soup.findAll(attrs={'class': re.compile(r'.*\baccordion-item\b.*')}) - routes = {} - for direction in directions: - direction_id = direction['data-directionid'] - route = [{'id': stop.find('a')['data-stopid'], 'name': stop['data-name'], - 'onDemand': re.search('stop-onDemand', str(stop['class'])) != None} - for stop in direction.findAll(attrs={'class': re.compile(r'.*\bstop-itm\b.*')})] - routes[direction_id] = route - return routes - - - def __get_stop_times(self, stop_id, line_id, direction_id): - """ - get timetable - """ - index = self.__post('https://www.ztm.poznan.pl/goeuropa-api/stop-info/{}/{}'. 
- format(stop_id, line_id), {'directionId': direction_id}) - soup = BeautifulSoup(index.text, 'html.parser') - legends = {} - for row in soup.find(attrs={'class': re.compile(r'.*\blegend-box\b.*')}).findAll('li'): - row = row.text.split('-') - row[0] = row[0].rstrip() - row[1] = row[1].lstrip() - if row[0] != '_': - legends[row[0]] = '-'.join(row[1:]) - schedules = {} - for mode in soup.findAll(attrs={'class': re.compile(r'.*\bmode-tab\b.*')}): - mode_name = mode['data-mode'] - schedule = {row.find('th').text: [ - {'time': minute.text, 'lowFloor': re.search('n-line', str(minute['class'])) != None} - for minute in row.findAll('a')] - for row in mode.find(attrs={'class': re.compile(r'.*\bscheduler-hours\b.*')}). - findAll('tr')} - schedule_2 = {hour: times for hour, times in schedule.items() if times != []} - schedule = [] - for hour, deps in schedule_2.items(): - for dep in deps: - schedule.append((hour, *self.__describe(dep['time'], legends), dep['lowFloor'])) - schedules[mode_name] = schedule - - return schedules - - - @staticmethod - def __describe(dep_time, legend): - """ - describe departure - """ - desc = [] - while re.match('^\\d+$', dep_time) is None: - try: - if dep_time[-1] != ',': - desc.append(legend[dep_time[-1]]) - except KeyError: - pass - dep_time = dep_time[:-1] - return (int(dep_time), '; '.join(desc)) - - - def __get(self, url): - try: - return self.session.get(url, verify='bundle.pem') - except: - self.session = requests.session() - return self.session.get(url, verify='bundle.pem') - - - def __post(self, url, data): - try: - return self.session.post(url, data=data, verify='bundle.pem') - except: - self.session = requests.session() - return self.session.post(url, data=data, verify='bundle.pem') - - - def download(self): - """ - main function - """ - if os.path.exists('timetable.db'): - connection = sqlite3.connect('timetable.db') - cursor = connection.cursor() - cursor.execute("select value from metadata where key = 'validFrom'") - 
current_valid_from = cursor.fetchone()[0] - cursor.close() - connection.close() - if self.__get_validity() <= current_valid_from: - return 304 - else: - os.remove('timetable.db') - - with sqlite3.connect('timetable.db') as connection: - try: - cursor = connection.cursor() - cursor.execute('create table metadata(key TEXT PRIMARY KEY, value TEXT)') - cursor.execute('create table nodes(symbol TEXT PRIMARY KEY, name TEXT)') - cursor.execute('create table stops(id TEXT PRIMARY KEY, symbol TEXT \ - references node(symbol), number TEXT, lat REAL, lon REAL, \ - headsigns TEXT)') - cursor.execute('create table lines(id TEXT PRIMARY KEY, number TEXT)') - cursor.execute('create table timetables(id TEXT PRIMARY KEY, stop_id TEXT references \ - stop(id), line_id TEXT references line(id), headsign TEXT)') - cursor.execute('create table departures(id INTEGER PRIMARY KEY, \ - timetable_id TEXT references timetable(id), \ - hour INTEGER, minute INTEGER, mode TEXT, \ - lowFloor INTEGER, modification TEXT)') - - validity = self.__get_validity() - print(validity) - sys.stdout.flush() - cursor.execute("insert into metadata values('validFrom', ?)", (validity,)) - nodes = self.__get_nodes() - cursor.executemany('insert into nodes values(?, ?)', nodes) - node_i = 1 - for symbol, _ in nodes: - if self.verbose: - print('node {}'.format(node_i)) - stops = self.__get_stops(symbol) - cursor.executemany('insert into stops values(?, ?, ?, ?, ?, ?)', stops) - node_i += 1 - lines = self.__get_lines() - cursor.executemany('insert into lines values(?, ?)', lines.items()) - - timetable_id = 1 - line_i = 1 - for line_id, _ in lines.items(): - route = self.__get_route(line_id) - route_i = 1 - for direction, stops in route.items(): - stop_i = 1 - for stop in stops: - if self.verbose: - print("stop {} in route {} in line {}".format(stop_i, route_i, line_i)) - timetables = self.__get_stop_times(stop['id'], line_id, direction) - cursor.execute('insert into timetables values(?, ?, ?, ?)', - (timetable_id, 
stop['id'], line_id, stops[-1]['name'])) - for mode, times in timetables.items(): - cursor.executemany('insert into departures values(null, ?, ?, ?, ?, ?, \ - ?)', [(timetable_id, hour, minute, mode, lowfloor, desc) - for hour, minute, desc, lowfloor in times]) - stop_i += 1 - timetable_id += 1 - route_i += 1 - line_i += 1 - except KeyboardInterrupt: - return 404 - return 0 - - -if __name__ == '__main__': - verbose = False - try: - if sys.argv[1] == '-v': - verbose = True - except IndexError: - pass - downloader = TimetableDownloader(verbose) - exit(downloader.download())