From 24429ba1f56dcc0baf6c73217e63e15428efbb2e Mon Sep 17 00:00:00 2001
From: poka
Date: Thu, 14 May 2020 03:56:31 +0200
Subject: [PATCH] Add transiscope scraping and add .gitignore file ...

---
Note: the "index" lines of the five scripts are omitted because their content
was revised and the recorded blob ids would no longer match the added files.

 .gitignore                                     |  2 ++
 .../transiscope/explore_transiscope.sh         | 16 ++++++++++++++++
 .../transiscope/generate_transiscope.sh        | 17 +++++++++++++++++
 .../scraping/transiscope/tests_scrap/scrap.py  | 10 ++++++++++
 .../scraping/transiscope/tests_scrap/scrap3.py | 23 +++++++++++++++++++++++
 .../transiscope/tests_scrap/scrapsoup.py       | 24 ++++++++++++++++++++++++
 6 files changed, 92 insertions(+)
 create mode 100644 .gitignore
 create mode 100755 zen/tools/scraping/transiscope/explore_transiscope.sh
 create mode 100755 zen/tools/scraping/transiscope/generate_transiscope.sh
 create mode 100755 zen/tools/scraping/transiscope/tests_scrap/scrap.py
 create mode 100755 zen/tools/scraping/transiscope/tests_scrap/scrap3.py
 create mode 100755 zen/tools/scraping/transiscope/tests_scrap/scrapsoup.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c8dc066
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.profile
+zen/tools/scraping/transiscope/transiscope.json
diff --git a/zen/tools/scraping/transiscope/explore_transiscope.sh b/zen/tools/scraping/transiscope/explore_transiscope.sh
new file mode 100755
--- /dev/null
+++ b/zen/tools/scraping/transiscope/explore_transiscope.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Print the name, abstract and geo fields of every element in transiscope.json,
+# downloading the dataset first when no local copy exists.
+
+# The helper script and the data file are addressed relative to this script,
+# so switch to its directory whatever the caller's working directory is.
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+if [[ ! -f transiscope.json ]]; then
+    echo "Premier lancement, récupération des données, veuillez patientez ..."
+    # There is nothing to display when the download fails.
+    ./generate_transiscope.sh || exit 1
+fi
+
+# jq is the last command, so its status becomes the script's exit status.
+jq '.[] | .name, .abstract, .geo' transiscope.json
diff --git a/zen/tools/scraping/transiscope/generate_transiscope.sh b/zen/tools/scraping/transiscope/generate_transiscope.sh
new file mode 100755
--- /dev/null
+++ b/zen/tools/scraping/transiscope/generate_transiscope.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Download the "data" member of the Transiscope elements API into
+# transiscope.json in the current directory.
+
+# Without pipefail only jq's status is tested, so a failed download would
+# silently replace the dataset with an empty file.
+set -o pipefail
+
+# Build the new dataset beside the target and rename it only once complete,
+# so an interrupted or failed run never clobbers the previous copy.
+tmp_file="transiscope.json.tmp.$$"
+trap 'rm -f "$tmp_file"' EXIT
+
+# curl -f fails on HTTP errors (-S still reports them despite -s);
+# jq -e fails when .data is null or when no result was produced at all.
+curl -fsS https://transiscope.gogocarto.fr/api/elements | jq -e .data > "$tmp_file" || exit 1
+mv "$tmp_file" transiscope.json
diff --git a/zen/tools/scraping/transiscope/tests_scrap/scrap.py b/zen/tools/scraping/transiscope/tests_scrap/scrap.py
new file mode 100755
--- /dev/null
+++ b/zen/tools/scraping/transiscope/tests_scrap/scrap.py
@@ -0,0 +1,10 @@
+#!/usr/bin/python3
+"""Print the raw body of the Transiscope map page fetched with cloudscraper."""
+
+import cloudscraper
+
+url = "https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all"
+
+scraper = cloudscraper.create_scraper()
+#scraper = cloudscraper.CloudScraper() # CloudScraper inherits from requests.Session
+print(scraper.get(url).content)
diff --git a/zen/tools/scraping/transiscope/tests_scrap/scrap3.py b/zen/tools/scraping/transiscope/tests_scrap/scrap3.py
new file mode 100755
--- /dev/null
+++ b/zen/tools/scraping/transiscope/tests_scrap/scrap3.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python3
+"""Print each comma-separated piece of the page's embedded point_list array."""
+
+import sys
+
+import requests
+from parsel import Selector
+
+
+#url = 'https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all'
+url = 'https://www.kurzy.cz/banky/bankomaty/zatec-okres-louny/'
+# A timeout keeps the script from hanging forever on an unresponsive server.
+r = requests.get(url, timeout=30)
+sel = Selector(r.text)
+all_address = sel.xpath('//script[contains(.,"point_list")]').re_first(r'point_list = \[(.*)\]\];')
+
+# re_first() returns None when the pattern is not found; report that clearly
+# instead of crashing with an AttributeError on .split().
+if all_address is None:
+    sys.exit('point_list not found in ' + url)
+
+for item in all_address.split(','):
+    print(item)
diff --git a/zen/tools/scraping/transiscope/tests_scrap/scrapsoup.py b/zen/tools/scraping/transiscope/tests_scrap/scrapsoup.py
new file mode 100755
--- /dev/null
+++ b/zen/tools/scraping/transiscope/tests_scrap/scrapsoup.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python3
+"""Count the li items inside the page element whose id is "element-info"."""
+
+from bs4 import BeautifulSoup
+import sys
+import urllib.request
+import csv
+
+urlpage = 'https://transiscope.org/carte-des-alternatives/#/carte/@46.33,-1.34,6z?cat=all'
+
+
+# query the website; the with-block closes the connection after parsing
+with urllib.request.urlopen(urlpage) as page:
+    # parse the html using beautiful soup and store in variable 'soup'
+    soup = BeautifulSoup(page, 'html.parser')
+
+
+table = soup.find(attrs={'id': 'element-info'})
+# find() returns None when the element is absent from the downloaded HTML;
+# exit with a clear message instead of an AttributeError on find_all().
+if table is None:
+    sys.exit('No element with id "element-info" in ' + urlpage)
+results = table.find_all('li')
+print('Number of results', len(results))