Author: Adam Pioterek <adam.pioterek@protonmail.ch>
server-side timetable converter
converter/bundle.pem | 83 ++++++++++ converter/converter.py | 351 ++++++++++++++++++++++++++++++++++++++++++++
diff --git a/converter/bundle.pem b/converter/bundle.pem new file mode 100644 index 0000000000000000000000000000000000000000..12585890132ce18cf3ebf0ac2819017cba1e3123 --- /dev/null +++ b/converter/bundle.pem @@ -0,0 +1,83 @@ +-----BEGIN CERTIFICATE----- +MIIDVDCCAjygAwIBAgIDAjRWMA0GCSqGSIb3DQEBBQUAMEIxCzAJBgNVBAYTAlVT +MRYwFAYDVQQKEw1HZW9UcnVzdCBJbmMuMRswGQYDVQQDExJHZW9UcnVzdCBHbG9i +YWwgQ0EwHhcNMDIwNTIxMDQwMDAwWhcNMjIwNTIxMDQwMDAwWjBCMQswCQYDVQQG +EwJVUzEWMBQGA1UEChMNR2VvVHJ1c3QgSW5jLjEbMBkGA1UEAxMSR2VvVHJ1c3Qg +R2xvYmFsIENBMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA2swYYzD9 +9BcjGlZ+W988bDjkcbd4kdS8odhM+KhDtgPpTSEHCIjaWC9mOSm9BXiLnTjoBbdq +fnGk5sRgprDvgOSJKA+eJdbtg/OtppHHmMlCGDUUna2YRpIuT8rxh0PBFpVXLVDv +iS2Aelet8u5fa9IAjbkU+BQVNdnARqN7csiRv8lVK83Qlz6cJmTM386DGXHKTubU +1XupGc1V3sjs0l44U+VcT4wt/lAjNvxm5suOpDkZALeVAjmRCw7+OC7RHQWa9k0+ +bw8HHa8sHo9gOeL6NlMTOdReJivbPagUvTLrGAMoUgRx5aszPeE4uwc2hGKceeoW +MPRfwCvocWvk+QIDAQABo1MwUTAPBgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBTA +ephojYn7qwVkDBF9qn1luMrMTjAfBgNVHSMEGDAWgBTAephojYn7qwVkDBF9qn1l +uMrMTjANBgkqhkiG9w0BAQUFAAOCAQEANeMpauUvXVSOKVCUn5kaFOSPeCpilKIn +Z57QzxpeR+nBsqTP3UEaBU6bS+5Kb1VSsyShNwrrZHYqLizz/Tt1kL/6cdjHPTfS +tQWVYrmm3ok9Nns4d0iXrKYgjy6myQzCsplFAMfOEVEiIuCl6rYVSAlk6l5PdPcF +PseKUgzbFbS9bZvlxrFUaKnjaZC2mqUPuLk/IH2uSrW4nOQdtqvmlKXBx4Ot2/Un +hw4EbNX/3aBd7YdStysVAq45pmp06drE57xNNB6pXE0zX5IJL4hmXXeXxx12E6nV +5fEWCRE11azbJHFwLJhWC9kXtNHjUStedejV0NxPNO3CBWaAocvmMw== +-----END CERTIFICATE----- +-----BEGIN CERTIFICATE----- +MIIETTCCAzWgAwIBAgIDAjpxMA0GCSqGSIb3DQEBCwUAMEIxCzAJBgNVBAYTAlVT +MRYwFAYDVQQKEw1HZW9UcnVzdCBJbmMuMRswGQYDVQQDExJHZW9UcnVzdCBHbG9i +YWwgQ0EwHhcNMTMxMjExMjM0NTUxWhcNMjIwNTIwMjM0NTUxWjBCMQswCQYDVQQG +EwJVUzEWMBQGA1UEChMNR2VvVHJ1c3QgSW5jLjEbMBkGA1UEAxMSUmFwaWRTU0wg +U0hBMjU2IENBMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAu1jBEgEu +l9h9GKrIwuWF4hdsYC7JjTEFORoGmFbdVNcRjFlbPbFUrkshhTIWX1SG5tmx2GCJ +a1i+ctqgAEJ2sSdZTM3jutRc2aZ/uyt11UZEvexAXFm33Vmf8Wr3BvzWLxmKlRK6 
+msrVMNI4/Bk7WxU7NtBDTdFlodSLwWBBs9ZwF8w5wJwMoD23ESJOztmpetIqYpyg +C04q18NhWoXdXBC5VD0tA/hJ8LySt7ecMcfpuKqCCwW5Mc0IW7siC/acjopVHHZD +dvDibvDfqCl158ikh4tq8bsIyTYYZe5QQ7hdctUoOeFTPiUs2itP3YqeUFDgb5rE +1RkmiQF1cwmbOwIDAQABo4IBSjCCAUYwHwYDVR0jBBgwFoAUwHqYaI2J+6sFZAwR +fap9ZbjKzE4wHQYDVR0OBBYEFJfCJ1CewsnsDIgyyHyt4qYBT9pvMBIGA1UdEwEB +/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDYGA1UdHwQvMC0wK6ApoCeGJWh0 +dHA6Ly9nMS5zeW1jYi5jb20vY3Jscy9ndGdsb2JhbC5jcmwwLwYIKwYBBQUHAQEE +IzAhMB8GCCsGAQUFBzABhhNodHRwOi8vZzIuc3ltY2IuY29tMEwGA1UdIARFMEMw +QQYKYIZIAYb4RQEHNjAzMDEGCCsGAQUFBwIBFiVodHRwOi8vd3d3Lmdlb3RydXN0 +LmNvbS9yZXNvdXJjZXMvY3BzMCkGA1UdEQQiMCCkHjAcMRowGAYDVQQDExFTeW1h +bnRlY1BLSS0xLTU2OTANBgkqhkiG9w0BAQsFAAOCAQEANevhiyBWlLp6vXmp9uP+ +bji0MsGj21hWID59xzqxZ2nVeRQb9vrsYPJ5zQoMYIp0TKOTKqDwUX/N6fmS/Zar +RfViPT9gRlATPSATGC6URq7VIf5Dockj/lPEvxrYrDrK3maXI67T30pNcx9vMaJR +BBZqAOv5jUOB8FChH6bKOvMoPF9RrNcKRXdLDlJiG9g4UaCSLT+Qbsh+QJ8gRhVd +4FB84XavXu0R0y8TubglpK9YCa81tGJUheNI3rzSkHp6pIQNo0LyUcDUrVNlXWz4 +Px8G8k/Ll6BKWcZ40egDuYVtLLrhX7atKz4lecWLVtXjCYDqwSfC2Q7sRwrp0Mr8 +2A== +-----END CERTIFICATE----- +-----BEGIN CERTIFICATE----- +MIIGZTCCBU2gAwIBAgIQCWJPhs8z+oLr80YZnKUBTjANBgkqhkiG9w0BAQsFADBC +MQswCQYDVQQGEwJVUzEWMBQGA1UEChMNR2VvVHJ1c3QgSW5jLjEbMBkGA1UEAxMS +UmFwaWRTU0wgU0hBMjU2IENBMB4XDTE3MDYyMjAwMDAwMFoXDTE4MDYyMjIzNTk1 +OVowHDEaMBgGA1UEAwwRd3d3Lnp0bS5wb3puYW4ucGwwggIiMA0GCSqGSIb3DQEB +AQUAA4ICDwAwggIKAoICAQCn/bPI4eRB/t1KyfVyb7QtesUzDQ0xgqUg/I4SasJZ +wj1G1fmkP9r7e1pACZ3SVrZwezqfa0dpRtSt6RYyqI70hdklLRAc4PsQb6ZWaOuz +Lg118OK4vDBAono+AlD0vNEdeF1vunQHsGCKvZ4V9SrNv3v86xw1iHZB1E3zupXx +xLrJSuD4D3HfUaiVwJnE0rF/2b2SF9WstSSczJm5b31uI8+CbCgThv380J/sMAKn +0UKHaxd+qZWbDMIS0OLc6kdFnm2fNrmwiSNtnE6bMaZBq9igCA41hMriGgBH2h0q +6AqlxK47DCDRi08H3DEBmi0mRmHdxu2AbEnHPPZuhOUvoHIwAiirI3Siivy2CLm1 +mBoHtrgK9OoID6bsSjhPG/zDBT4wd5QAOwNSQ9ncQy0lEyC3pQ80P4pVo8Oq0HlW +BPmlUWfD7C7Dh+h0RMP+aN0KD9oDL7Fx9zNZBjqqYSuqczPZIcSPi7c9YP4g/c2r +SgmhAl/4EDN6UVZy8lWyD178OL5GoeCxWaxspCry5h5PAMsr+sftgXx9/Z4ymi/8 
# NOTE: this text was extracted from a git patch.  The next comment line keeps,
# verbatim, the patch content that shared a physical line with the start of the
# script (tail of converter/bundle.pem and the diff header of converter.py).
# +W6LUfeCmehmtxHWoTpCgT52Q9wQmqnZphj9KLNCqKpJRNV5YrAxc4ZDEJaxZI/ZS +TFptJKVJstBKfeBw203gaye8U4xiVxrSLOJqEL65kjZg1LOSCn315iPH7zTR1xS2 +GQIDAQABo4ICezCCAncwHAYDVR0RBBUwE4IRd3d3Lnp0bS5wb3puYW4ucGwwCQYD +VR0TBAIwADArBgNVHR8EJDAiMCCgHqAchhpodHRwOi8vZ3Auc3ltY2IuY29tL2dw +LmNybDBvBgNVHSAEaDBmMGQGBmeBDAECATBaMCoGCCsGAQUFBwIBFh5odHRwczov +L3d3dy5yYXBpZHNzbC5jb20vbGVnYWwwLAYIKwYBBQUHAgIwIAweaHR0cHM6Ly93 +d3cucmFwaWRzc2wuY29tL2xlZ2FsMB8GA1UdIwQYMBaAFJfCJ1CewsnsDIgyyHyt +4qYBT9pvMA4GA1UdDwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYB +BQUHAwIwVwYIKwYBBQUHAQEESzBJMB8GCCsGAQUFBzABhhNodHRwOi8vZ3Auc3lt +Y2QuY29tMCYGCCsGAQUFBzAChhpodHRwOi8vZ3Auc3ltY2IuY29tL2dwLmNydDCC +AQMGCisGAQQB1nkCBAIEgfQEgfEA7wB2AN3rHSt6DU+mIIuBrYFocH4ujp0B1VyI +jT0RxM227L7MAAABXM9F02oAAAQDAEcwRQIgXZ3Ri+/TqCOBrN0h1bgZMeaMUGCq +Z+X3x32HnilBFqwCIQDYH4+Vkc5PKYGU8zwSDPa5flQlOrnQSoZ1MmGzJzzWeAB1 +AKS5CZC0GFgUh7sTosxncAo8NZgE+RvfuON3zQ7IDdwQAAABXM9F06EAAAQDAEYw +RAIgHPwBJvn48rGK4XlKWYDaoA2JYu0ncaQVyJTOsccc64MCIGjc42DqWCyhSUAA +TMFz6GVsglqfvv90bCk3yEoLyEQoMA0GCSqGSIb3DQEBCwUAA4IBAQCmXR1Pmo8J +uMqVJe6NXklEmoFwM1UJ4td8fLrwMEYwaknFSmaB6JkjOS5cZ3Segb9XBY4sT9Ya +AvF2kdOKA339UaSd+yimYYLt/KhO1nnYlNKrfGcjTnybuyczUDxsYR4N8FbTI9Yr +Z0baIlbVmSax9tjxwFMLWkgcd6tbwzCs+XPjvKMwQgxfuu14a5lI9EMsE1jjgxpa +jhedn5fNtKZdgaY9NVOsTkPSw79CzQfgSamSONyWaT949maFRe//sZPBOnE4jTBb +VJrNbfm+H+NFv7bF1Js2xPkgJuGkuKVc1H1qoJX4lf1vfpaZSSmzbe2O6GXHKF4y +MKlNUKhgoBO4 +-----END CERTIFICATE----- diff --git a/converter/converter.py b/converter/converter.py new file mode 100755 index 0000000000000000000000000000000000000000..ef7a248b0ea2fb23b7b02abc509abc6a4b7cd017 --- /dev/null +++ b/converter/converter.py @@ -0,0 +1,351 @@
#!/bin/python3
# -*- coding: UTF-8 -*-
"""Server-side timetable converter.

Downloads the GTFS archives published on www.ztm.poznan.pl, loads each one
into a SQLite database, gzips the database under the name of its SHA-256
checksum and records the result in ``metadata.yml``.  All files are read from
and written to the current working directory.
"""
import csv
import gzip
import hashlib
import io
import os
import re
import shutil
import sqlite3
import zipfile
from datetime import date
from pathlib import Path

import requests
import yaml
from bs4 import BeautifulSoup


class TimetableDownloader:
    """Download, convert and archive the current and upcoming timetables.

    Calling an instance runs the whole job: download what is missing, delete
    the archives that are no longer needed and rewrite ``metadata.yml``.
    """

    __INDEX_URL = 'https://www.ztm.poznan.pl/pl/dla-deweloperow/gtfsFiles'
    __FILE_URL = ('https://www.ztm.poznan.pl/pl/dla-deweloperow/'
                  'getGTFSFile?file={}.zip')

    def __init__(self):
        self.__converter = TimetableConverter()
        # Both are (re)assigned by __download(); defined here so the
        # attributes always exist.
        self.__session = None
        self.__metadata = []

    def __call__(self):
        """Run the download / clean-up cycle and save ``metadata.yml``."""
        self.__download()
        self.__tidy_up()

        with open('metadata.yml', 'w') as metadata_file:
            yaml.dump(self.__metadata, metadata_file)

    @staticmethod
    def __load_metadata():
        """Return the rows stored in ``metadata.yml`` (``[]`` if none)."""
        if not os.path.isfile('metadata.yml'):
            return []
        with open('metadata.yml', 'r') as metadata_file:
            # safe_load: the file only holds plain lists/dicts/str/int, and
            # yaml.load() without an explicit Loader is rejected by current
            # PyYAML.  An empty file parses to None, hence the `or []`.
            return yaml.safe_load(metadata_file) or []

    def __download(self):
        """Fetch and convert every needed timetable that is not stored yet."""
        self.__session = requests.session()
        html = self.__session.get(self.__INDEX_URL, verify='bundle.pem')
        soup = BeautifulSoup(html.text, 'html.parser')
        # File names are taken from the cells of the last table on the page
        # whose text matches <8 digits>_<8 digits>.
        cells = soup.find_all('table')[-1].tbody.find_all(
            'td', string=re.compile('[0-9]{8}_[0-9]{8}'))
        names_table = [cell.string.replace('.zip', '') for cell in cells]

        self.__metadata = self.__load_metadata()

        to_download = [name for name in names_table
                       if self.__is_valid(name) or self.__will_valid(name)]
        to_download = self.__clean_overlapping(to_download)
        to_download = self.__select_not_had(to_download)

        for name in to_download:
            print('getting {}.zip'.format(name))
            self.__get_timetable(name)
            checksum = self.__converter()
            # The extracted GTFS text files are not needed after conversion.
            for extracted in Path('.').glob('*.txt'):
                extracted.unlink()
            size_u = os.path.getsize('timetable.db')
            self.__compress(checksum)
            size_c = os.path.getsize('{}.db.gz'.format(checksum))
            self.__archive(name, checksum, size_u, size_c)

    def __is_valid(self, name):
        """Return True if today lies within ``name`` (``START_END``).

        Both dates are ``YYYYMMDD`` strings, so plain string comparison
        orders them chronologically.
        """
        today = date.today().strftime('%Y%m%d')
        start, end = name.split('_')
        return start <= today <= end

    def __will_valid(self, name):
        """Return True if the period named by ``name`` starts after today."""
        today = date.today().strftime('%Y%m%d')
        start, _ = name.split('_')
        return today < start

    def __clean_overlapping(self, names):
        """Return ``names`` sorted, without the entries superseded by the next.

        Each name is compared with the one sorted right after it and is kept
        when that successor starts on the same day or ends later; the last
        name is always kept.  Empty and single-element input is returned
        unchanged (the original raised IndexError on an empty list).
        """
        if len(names) <= 1:
            return list(names)

        ordered = sorted(names)
        kept = []
        for prev, this in zip(ordered, ordered[1:]):
            this_start, this_end = this.split('_')
            prev_start, prev_end = prev.split('_')
            if this_start <= prev_start or this_end > prev_end:
                kept.append(prev)
        kept.append(ordered[-1])
        return kept

    def __select_not_had(self, names):
        """Return the ``names`` that have no row in the metadata yet."""
        had = {'_'.join((row['start'], row['end']))
               for row in self.__metadata}
        return [name for name in names if name not in had]

    def __get_timetable(self, name):
        """Download ``name``.zip and extract it into the working directory."""
        response = self.__session.get(self.__FILE_URL.format(name),
                                      verify='bundle.pem')
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_file:
            zip_file.extractall()

    def __compress(self, checksum):
        """Gzip ``timetable.db`` into ``<checksum>.db.gz`` and delete it."""
        with open('timetable.db', 'rb') as f_in:
            with gzip.open('{}.db.gz'.format(checksum), 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        Path('timetable.db').unlink()

    def __archive(self, name, checksum, size_u, size_c):
        """Append a metadata row describing a freshly converted timetable."""
        start_date, end_date = name.split('_')
        self.__metadata.append({
            'size_uncompressed': size_u,
            'size_compressed': size_c,
            'id': checksum,
            'start': start_date,
            'end': end_date,
        })

    def __tidy_up(self):
        """Delete the archives of timetables that are no longer needed.

        A stored timetable stays when it is current or upcoming and survives
        ``__clean_overlapping``.  Metadata rows are left untouched, as
        before; ``__select_not_had`` uses them to avoid downloading the same
        timetable again.
        """
        def row_name(row):
            return '_'.join((row['start'], row['end']))

        names = [row_name(row) for row in self.__metadata]
        to_stay = [name for name in names
                   if self.__is_valid(name) or self.__will_valid(name)]
        to_stay = set(self.__clean_overlapping(to_stay))

        # Archives are named after the database checksum, so two rows may
        # point at the same file; never delete one that is still in use.
        ids_in_use = {row['id'] for row in self.__metadata
                      if row_name(row) in to_stay}

        for row in self.__metadata:
            if row_name(row) in to_stay or row['id'] in ids_in_use:
                continue
            try:
                # The original referenced `.unlink` without calling it, so
                # nothing was ever removed.
                Path('{}.db.gz'.format(row['id'])).unlink()
            except FileNotFoundError:
                # Rows outlive their archives, so a file deleted on an
                # earlier run is expected to be missing here.
                pass


class TimetableConverter:
    """Load the GTFS text files from the working directory into SQLite.

    Calling an instance builds ``timetable.db`` and returns the SHA-256 hex
    digest of the resulting file.
    """

    __BUF_SIZE = 65536

    # (GTFS file, target table, number of columns), in insertion order.
    __FEED_FILES = (
        ('agency.txt', 'agency', 6),
        ('stops.txt', 'stops', 6),
        ('routes.txt', 'routes', 8),
        ('trips.txt', 'trips', 7),
        ('stop_times.txt', 'stop_times', 8),
        ('calendar.txt', 'calendar', 10),
        ('calendar_dates.txt', 'calendar_dates', 3),
        ('shapes.txt', 'shapes', 4),
        ('feed_info.txt', 'feed_info', 5),
    )

    __SCHEMA = (
        '''create table agency(agency_id TEXT PRIMARY KEY,
                               agency_name TEXT,
                               agency_url TEXT,
                               agency_timezone TEXT,
                               agency_phone TEXT,
                               agency_lang TEXT)''',
        '''create table stops(stop_id TEXT PRIMARY KEY,
                              stop_code TEXT,
                              stop_name TEXT,
                              stop_lat DOUBLE,
                              stop_lon DOUBLE,
                              zone_id TEXT)''',
        '''create table routes(route_id TEXT PRIMARY KEY,
                               agency_id TEXT,
                               route_short_name TEXT,
                               route_long_name TEXT,
                               route_desc TEXT,
                               route_type INTEGER,
                               route_color TEXT,
                               route_text_color TEXT,
                               FOREIGN KEY(agency_id)
                               REFERENCES agency(agency_id))''',
        '''create table trips(route_id TEXT,
                              service_id TEXT,
                              trip_id TEXT PRIMARY KEY,
                              trip_headsign TEXT,
                              direction_id INTEGER,
                              shape_id TEXT,
                              wheelchair_accessible BOOL,
                              FOREIGN KEY(route_id)
                              REFERENCES routes(route_id),
                              FOREIGN KEY(service_id)
                              REFERENCES calendar(service_id),
                              FOREIGN KEY(shape_id)
                              REFERENCES shapes(shape_id))''',
        '''create table stop_times(trip_id TEXT,
                                   arrival_time TEXT,
                                   departure_time TEXT,
                                   stop_id TEXT,
                                   stop_sequence INTEGER,
                                   stop_headsign TEXT,
                                   pickup_type INTEGER,
                                   drop_off_type INTEGER,
                                   FOREIGN KEY(trip_id)
                                   REFERENCES trips(trip_id),
                                   FOREIGN KEY(stop_id)
                                   REFERENCES stops(stop_id))''',
        '''create table calendar(service_id TEXT PRIMARY KEY,
                                 monday TEXT,
                                 tuesday TEXT,
                                 wednesday TEXT,
                                 thursday TEXT,
                                 friday TEXT,
                                 saturday TEXT,
                                 sunday TEXT,
                                 start_date TEXT,
                                 end_date TEXT)''',
        '''create table calendar_dates(service_id TEXT,
                                       date TEXT,
                                       exception_type INTEGER,
                                       FOREIGN KEY(service_id)
                                       REFERENCES calendar(service_id))''',
        '''create table shapes(shape_id TEXT,
                               shape_pt_lat DOUBLE,
                               shape_pt_lon DOUBLE,
                               shape_pt_sequence INTEGER)''',
        '''create table feed_info(feed_publisher_name TEXT,
                                  feed_publisher_url TEXT,
                                  feed_lang TEXT,
                                  feed_start_date TEXT,
                                  feed_end_date TEXT)''',
    )

    __INDEXES = (
        '''create index ix_stop_times__stop
           on stop_times(stop_id)''',
        '''create index ix_stop_times__trip
           on stop_times(trip_id)''',
        '''create index ix_shapes__shape
           on shapes(shape_id)''',
    )

    def __call__(self):
        """Convert the extracted feed; return the database checksum."""
        return self.__convert()

    def __convert(self):
        """Build ``timetable.db`` and return its SHA-256 hex digest."""
        # The database is always built from scratch; a file left behind by
        # an interrupted run would make `create table` fail.
        stale = Path('timetable.db')
        if stale.exists():
            stale.unlink()

        connection = sqlite3.connect('timetable.db')
        try:
            self.__cursor = connection.cursor()
            self.__create_tables()
            for filename, table, columns in self.__FEED_FILES:
                self.__insert_file(filename, table, columns)
            self.__create_indexes()
            connection.commit()
        finally:
            # Close before hashing so the connection is not leaked and the
            # file on disk is final.
            connection.close()

        checksum = self.__hash_file('timetable.db')
        print(checksum)
        return checksum

    @classmethod
    def __hash_file(cls, file):
        """Return the SHA-256 hex digest of ``file``, read in chunks."""
        checksum = hashlib.sha256()
        with open(file, 'rb') as f:
            for data in iter(lambda: f.read(cls.__BUF_SIZE), b''):
                checksum.update(data)
        return checksum.hexdigest()

    def __insert_file(self, filename, table, columns):
        """Insert every data row of the CSV ``filename`` into ``table``.

        The first row (the header) is skipped and blank rows are ignored.
        ``table`` and ``columns`` come from the class constant above, never
        from input, so formatting them into the statement is safe.
        """
        statement = 'insert into {} values({})'.format(
            table, ', '.join(['?'] * columns))
        # GTFS files are UTF-8 by specification; newline='' is what the csv
        # module requires to handle quoted line breaks correctly.
        with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader, None)
            self.__cursor.executemany(
                statement, (row for row in reader if row))

    def __create_tables(self):
        """Create one table per GTFS file."""
        for statement in self.__SCHEMA:
            self.__cursor.execute(statement)

    def __create_indexes(self):
        """Create the lookup indexes on ``stop_times`` and ``shapes``."""
        for statement in self.__INDEXES:
            self.__cursor.execute(statement)


if __name__ == '__main__':
    downloader = TimetableDownloader()
    downloader()