Bimba.git

commit e7b4002ca5b16dfcbf932e49c7b4df8aa28866f4

Author: Adam Pioterek <adam.pioterek@protonmail.ch>

scrapper only runs when changed timetable

 research/scraper.py | 49 +++++++++++++++++++++++++++++++++-------------


diff --git a/research/scraper.py b/research/scraper.py
index 96b855fb4275f76a18daad2486598937c3391e31..1315a535dbb72bd0828b629cb9ca3df3f119e232 100755
--- a/research/scraper.py
+++ b/research/scraper.py
@@ -8,14 +8,22 @@ bike stations: http://www.ztm.poznan.pl/goeuropa-api/bike-stations
 
 """
 import json
-import hashlib
 import os
 import re
 import sqlite3
 import sys
-import time
 import requests
 from bs4 import BeautifulSoup
+
+def get_validity():
+    """
+    get timetable validity
+    """
+    session = requests.session()
+    index = session.get('https://www.ztm.poznan.pl/goeuropa-api/index', verify='bundle.pem')
+    option = re.search('<option value="[0-9]{8}" selected', index.text).group()
+    return option.split('"')[1]
+
 
 def get_nodes():
     """
@@ -111,7 +119,7 @@             for dep in deps:
                 schedule.append((hour, *describe(dep['time'], legends), dep['lowFloor']))
         schedules[mode_name] = schedule
 
-    return schedules, hashlib.sha512(index.text.encode('utf-8')).hexdigest()
+    return schedules, ''
 
 
 def describe(dep_time, legend):
@@ -120,8 +128,11 @@     describe departure
     """
     desc = []
     while re.match('^\\d+$', dep_time) is None:
-        if dep_time[-1] != ',':
-            desc.append(legend[dep_time[-1]])
+        try:
+            if dep_time[-1] != ',':
+                desc.append(legend[dep_time[-1]])
+        except KeyError:
+            pass
         dep_time = dep_time[:-1]
     return (int(dep_time), '; '.join(desc))
 
@@ -130,13 +141,28 @@ def main():
     """
     main function
     """
+    validity = get_validity()
     if os.path.exists('timetable.db'):
-        return
+        try:
+            connection = sqlite3.connect('timetable.db')
+            cursor = connection.cursor()
+            cursor.execute("select value from metadata where key = 'validFrom'")
+            current_valid_from = cursor.fetchone()[0]
+            connection.close()
+            if validity <= current_valid_from:
+                return 304
+            else:
+                os.remove('timetable.db')
+        except:
+            connection.close()
+            os.remove('timetable.db')
 
-    print(time.time())
+    print(validity)
+    sys.stdout.flush()
     with sqlite3.connect('timetable.db') as connection:
         print('creating tables')
         cursor = connection.cursor()
+        cursor.execute('create table metadata(key TEXT PRIMARY KEY, value TEXT)')
         cursor.execute('create table nodes(symbol TEXT PRIMARY KEY, name TEXT)')
         cursor.execute('create table stops(id TEXT PRIMARY KEY, symbol TEXT \
                         references node(symbol), number TEXT, lat REAL, lon REAL, headsigns TEXT)')
@@ -155,8 +181,7 @@         nodes_no = len(nodes)
         print('getting stops')
         node_i = 1
         for symbol, _ in nodes:
-            print('\rstop {}/{}'.format(node_i, nodes_no), end='')
-            sys.stdout.flush()
+            print('stop {}/{}'.format(node_i, nodes_no))
             cursor.executemany('insert into stops values(?, ?, ?, ?, ?, ?);', get_stops(symbol))
             node_i += 1
         print('')
@@ -174,8 +199,7 @@                 stops_no = len(stops)
                 stop_i = 1
                 for stop in stops:
                     print('line {}/{} route {}/{} stop {}/{}'.
-                          format(line_i, lines_no, route_i, routes_no, stop_i, stops_no), end='')
-                    sys.stdout.flush()
+                          format(line_i, lines_no, route_i, routes_no, stop_i, stops_no))
                     timetables, checksum = get_stop_times(stop['id'], line_id, direction)
                     cursor.execute('insert into timetables values(?, ?, ?, ?, ?);',
                                    (tti, stop['id'], line_id, stops[-1]['name'], checksum))
@@ -185,12 +209,9 @@                                            [(tti, hour, minute, mode, lowfloor, desc)
                                             for hour, minute, desc, lowfloor in times])
                     stop_i += 1
                     tti += 1
-                    print('{}\r'.format(' '*35), end='')
-                    sys.stdout.flush()
                 route_i += 1
             print('')
             line_i += 1
-    print(time.time())
 
 
 if __name__ == '__main__':