Add transiscope scraping and add .gitignore file ...

This commit is contained in:
poka 2020-05-14 03:56:31 +02:00
parent 3a97143d5d
commit 24429ba1f5
6 changed files with 60 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.profile
zen/tools/scraping/transiscope/transiscope.json

View File

@ -0,0 +1,9 @@
#!/bin/bash
# Print the name, abstract and geo fields of every element stored in
# transiscope.json (looked up in the current working directory).
# When the file does not exist yet, ./generate_transiscope.sh is run first
# to fetch it.
if [[ ! -f transiscope.json ]]; then
    echo "Premier lancement, récupération des données, veuillez patientez ..."
    # Stop here if the download failed; otherwise jq below would run on a
    # missing file and the script would still report success.
    ./generate_transiscope.sh || exit 1
fi
# jq reads the file directly; no need to pipe it through cat.
jq '.[] | .name, .abstract, .geo' transiscope.json
exit 0

View File

@ -0,0 +1,7 @@
#!/bin/bash
# Download the element list from the transiscope gogocarto API and store its
# "data" member as transiscope.json in the current working directory.
# Exits 1 without touching an existing transiscope.json when the download or
# the extraction fails.

# Make the pipeline fail when curl fails, not only when jq does.
set -o pipefail

tmp_file=/tmp/tmp_transiscope.json

# -f: treat HTTP errors as failures instead of saving the error body.
# -sS: stay quiet but still report errors on stderr.
# jq -e: exit non-zero when .data is missing (null), so an unexpected API
# response is not stored as valid data.
if ! curl -fsS https://transiscope.gogocarto.fr/api/elements | jq -e .data > "$tmp_file"; then
    rm -f "$tmp_file"
    exit 1
fi

# mv overwrites an existing transiscope.json, so no prior rm is needed.
mv "$tmp_file" transiscope.json || exit 1
exit 0

View File

@ -0,0 +1,9 @@
#!/usr/bin/python3
import cloudscraper
# Page to fetch through a cloudscraper session.
url = "https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all"
# create_scraper() builds the session used for the request. An alternative is
# cloudscraper.CloudScraper(), which inherits from requests.Session.
scraper = cloudscraper.create_scraper()
response = scraper.get(url)
# Dump the raw response body to stdout.
print(response.content)

View File

@ -0,0 +1,15 @@
#!/usr/bin/python3
import requests
from parsel import Selector
#url = 'https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all'
# Fetch the page and print each comma-separated piece of the JavaScript
# "point_list" array embedded in one of its <script> tags.
url = 'https://www.kurzy.cz/banky/bankomaty/zatec-okres-louny/'
# Bound the wait so an unresponsive server cannot hang the script, and stop
# on HTTP errors instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
sel = Selector(r.text)
all_address = sel.xpath('//script[contains(.,"point_list")]').re_first(r'point_list = \[(.*)\]\];')
# re_first() returns None when nothing matches; report that clearly rather
# than failing with AttributeError on .split().
if all_address is None:
    raise SystemExit('No point_list array found in ' + url)
for item in all_address.split(','):
    print(item)

View File

@ -0,0 +1,18 @@
#!/usr/bin/python3
from bs4 import BeautifulSoup
import urllib.request
import csv
# Count the <li> entries inside the element whose id is "element-info".
urlpage = 'https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all'
# Query the website and parse the HTML while the connection is open; the
# with-block closes the response afterwards. The timeout bounds the wait on
# an unresponsive server.
with urllib.request.urlopen(urlpage, timeout=30) as page:
    soup = BeautifulSoup(page, 'html.parser')
table = soup.find(attrs={'id': 'element-info'})
# find() returns None when no element matches; report that clearly rather
# than failing with AttributeError on .find_all().
if table is None:
    raise SystemExit('No element with id "element-info" found in ' + urlpage)
results = table.find_all('li')
print('Number of results', len(results))