Add transiscope scraping and add .gitignore file ...

2020-05-14 03:56:31 +02:00 · 2020-05-14 03:56:31 +02:00 · 24429ba1f5
parent 3a97143d5d
commit 24429ba1f5
6 changed files with 60 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+.profile
+zen/tools/scraping/transiscope/transiscope.json
--- a/zen/tools/scraping/transiscope/explore_transiscope.sh
+++ b/zen/tools/scraping/transiscope/explore_transiscope.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+# Print the name, abstract and geo fields of every entry in transiscope.json,
+# running generate_transiscope.sh first when the file does not exist yet.
+if [[ ! -f transiscope.json ]]; then
+	echo "Premier lancement, récupération des données, veuillez patientez ..."
+	# Stop here if the download fails; jq would otherwise run on a missing file.
+	./generate_transiscope.sh || exit 1
+fi
+
+# jq reads the file directly (no extra cat process) and, as the last command,
+# its exit status becomes the script's status instead of an unconditional 0.
+jq '.[] | .name, .abstract, .geo' transiscope.json
--- a/zen/tools/scraping/transiscope/generate_transiscope.sh
+++ b/zen/tools/scraping/transiscope/generate_transiscope.sh
@ -0,0 +1,7 @@
+#!/bin/bash
+
+curl -s https://transiscope.gogocarto.fr/api/elements | jq .data > /tmp/tmp_transiscope.json || exit 1
+[[ -f transiscope.json ]] && rm transiscope.json
+mv /tmp/tmp_transiscope.json transiscope.json
+
+exit 0
--- a/zen/tools/scraping/transiscope/tests_scrap/scrap.py
+++ b/zen/tools/scraping/transiscope/tests_scrap/scrap.py
@ -0,0 +1,9 @@
+#!/usr/bin/python3
+
+import cloudscraper
+
+url = "https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all"
+
+scraper = cloudscraper.create_scraper()
+#scraper = cloudscraper.CloudScraper()  # CloudScraper inherits from requests.Session
+print(scraper.get(url).content)
--- a/zen/tools/scraping/transiscope/tests_scrap/scrap3.py
+++ b/zen/tools/scraping/transiscope/tests_scrap/scrap3.py
@ -0,0 +1,15 @@
+#!/usr/bin/python3
+
+
+import requests
+from parsel import Selector
+
+
+#url = 'https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all'
+url = 'https://www.kurzy.cz/banky/bankomaty/zatec-okres-louny/'
+r = requests.get(url)
+sel = Selector(r.text)
+all_address = sel.xpath('//script[contains(.,"point_list")]').re_first(r'point_list = \[(.*)\]\];')
+
+for item in all_address.split(','):
+    print(item)
--- a/zen/tools/scraping/transiscope/tests_scrap/scrapsoup.py
+++ b/zen/tools/scraping/transiscope/tests_scrap/scrapsoup.py
@ -0,0 +1,18 @@
+#!/usr/bin/python3
+
+from bs4 import BeautifulSoup
+import urllib.request
+import csv
+
+urlpage = 'https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all'
+
+
+# query the website and return the html to the variable 'page'
+page = urllib.request.urlopen(urlpage)
+# parse the html using beautiful soup and store in variable 'soup'
+soup = BeautifulSoup(page, 'html.parser')
+
+
+table = soup.find(attrs={'id': 'element-info'})
+results = table.find_all('li')
+print('Number of results', len(results))