1. Debian 9 + Python 3.5.3 ("python3 --version")

link_crawler3.py:
```python
# -*- coding: utf-8 -*-
import re
import queue
import time
from common import download
from urllib import request
from urllib import robotparser
from urllib.parse import urljoin
from urllib.parse import urlparse
from datetime import datetime


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                 headers=None, user_agent='wswp', proxy=None, num_retries=1):
    """Crawl from the given seed URL following links matched by link_regex"""
    # the queue of URL's that still need to be crawled
    crawl_queue = queue.deque([seed_url])
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = Throttle.get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []

            depth = seen[url]
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:%s' % url)


class Throttle:
    """Throttle downloading by sleeping between requests to same domain"""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print('Downloading:%s' % url)
    request = request.Request(url, data, headers)
    opener = request.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain"""
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain"""
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain"""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
```
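For testing the robots.txt part on its own, here is a minimal sketch using only the standard library (the helper name "fetch_allowed" is made up for this sketch; the URLs and user agents are the ones from the script):

```python
from urllib import robotparser
from urllib.parse import urljoin

def fetch_allowed(seed_url, url, user_agent='wswp'):
    """Hypothetical helper: ask the site's robots.txt whether user_agent may fetch url."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))  # e.g. http://example.webscraping.com/robots.txt
    rp.read()  # fetches robots.txt over the network
    return rp.can_fetch(user_agent, url)

if __name__ == '__main__':
    seed = 'http://example.webscraping.com'
    print(fetch_allowed(seed, seed + '/index', user_agent='BadCrawler'))
    print(fetch_allowed(seed, seed + '/index', user_agent='GoodCrawler'))
```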
2. It seems we should learn more about "urllib". Running the script gives:

```
cor@debian:~$ /usr/bin/python3 /home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py
Traceback (most recent call last):
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 147, in <module>
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 22, in link_crawler
    rp = Throttle.get_robots(seed_url)
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 133, in get_robots
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
AttributeError: 'function' object has no attribute 'urljoin'
```
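Maybe the AttributeError is because in Python 3 "from urllib.parse import urlparse" binds a function, not the old Python 2 "urlparse" module, so it has no "urljoin" attribute; "urljoin" and "urldefrag" are separate names in "urllib.parse". A quick check in the interpreter (standard library only, not the book's code):

```python
from urllib.parse import urlparse, urljoin, urldefrag

# urlparse() is a function in Python 3, not a module, so urlparse.urljoin does not exist.
print(hasattr(urlparse, 'urljoin'))                              # False
print(urljoin('http://example.webscraping.com', '/robots.txt'))  # http://example.webscraping.com/robots.txt
print(urldefrag('http://example.webscraping.com/index#top'))     # DefragResult(url='http://example.webscraping.com/index', fragment='top')
```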
3. TypeError: expected string or bytes-like object

```
cor@debian:~$ /usr/bin/python3 /home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py
Downloading:http://example.webscraping.com
Downloading--2
Downloading:http://example.webscraping.com
Downloading --- 5
Downloading:http://example.webscraping.com
Downloading --- 5
Traceback (most recent call last):
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 150, in <module>
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 36, in link_crawler
    html = download(url, headers, proxy=proxy, num_retries=num_retries)
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/common.py", line 72, in download5
    html = opener.open(requestnew).read().decode('utf-8')
  File "/usr/lib/python3.5/urllib/request.py", line 466, in open
    response = self._open(req, data)
  File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
    '_open', req)
  File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
    h.request(req.get_method(), req.selector, req.data, headers)
  File "/usr/lib/python3.5/http/client.py", line 1107, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1147, in _send_request
    self.putheader(hdr, value)
  File "/usr/lib/python3.5/http/client.py", line 1083, in putheader
    if _is_illegal_header_value(values[i]):
TypeError: expected string or bytes-like object
```
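Maybe this TypeError comes from http.client checking each header value with a regular expression: a value that is not str/bytes (for example a whole dict where a user-agent string is expected) gives exactly "expected string or bytes-like object". A small reproduction, assuming nothing beyond the standard library (the dict-as-header-value is only for illustration; the check fires while the headers are being prepared, before anything is sent):

```python
from urllib import request

url = 'http://example.webscraping.com'

# A str value is fine; a non-string value such as a dict trips
# http.client's header validation with this exact TypeError.
bad = request.Request(url, headers={'User-agent': {'User-agent': 'wswp'}})
try:
    request.urlopen(bad)
except TypeError as e:
    print('TypeError:', e)  # expected string or bytes-like object
```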
4. Below is "common.py":
```python
# -*- coding: utf-8 -*-
from urllib import request


def download1(url):
    """Simple downloader"""
    # before
    #return urllib.urlopen(url).read()
    # after, using urllib.request instead
    print('Downloading--1')
    return request.urlopen(url)


def download2(url):
    """Download function that catches errors"""
    print('Downloading:%s' % url)
    print('Downloading--2')
    try:
        html = request.urlopen(url).read()
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = None
    return html

download2('http://example.webscraping.com')


def download3(url, num_retries=2):
    """Download function that also retries 5XX errors"""
    print('Downloading:%s' % url)
    print('Downloading--3')
    try:
        html = request.urlopen(url).read()
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download3(url, num_retries-1)
    return html


def download4(url, user_agent='wswp', num_retries=2):
    """Download function that includes user agent support"""
    print('Downloading:%s' % url)
    print('Downloading--4')
    headers = {'User-agent': user_agent}
    requestnew = request.Request(url, headers=headers)
    try:
        html = request.urlopen(requestnew).read()
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download4(url, user_agent, num_retries-1)
    return html


def download5(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download function with support for proxies"""
    print('Downloading:%s' % url)
    print('Downloading --- 5')
    headers = {'User-agent': user_agent}
    requestnew = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        html = opener.open(requestnew).read().decode('utf-8')
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download5(url, user_agent, proxy, num_retries-1)
    return html

download = download5


if __name__ == '__main__':
    print(download('http://example.webscraping.com'))
```
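Comparing the two files, one thing I notice (just an observation, not a verified fix): link_crawler3.py calls download(url, headers, proxy=proxy, num_retries=num_retries), but common.py ends with download = download5, and download5(url, user_agent='wswp', proxy=None, num_retries=2) expects a user-agent string as its second argument, so the headers dict would end up as the value of the 'User-agent' header. A sketch of a wrapper that accepts the headers dict instead (the name download_with_headers is made up for this sketch):

```python
# -*- coding: utf-8 -*-
from urllib import request
from urllib.parse import urlparse


def download_with_headers(url, headers=None, proxy=None, num_retries=2):
    """Hypothetical variant of download5 that takes a prepared headers dict
    (as link_crawler3.py passes) rather than a bare user-agent string."""
    print('Downloading:%s' % url)
    requestnew = request.Request(url, headers=headers or {})
    opener = request.build_opener()
    if proxy:
        # register the proxy for this URL's scheme, as download5 intends
        opener.add_handler(request.ProxyHandler({urlparse(url).scheme: proxy}))
    try:
        html = opener.open(requestnew).read().decode('utf-8')
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry 5XX HTTP errors
            html = download_with_headers(url, headers, proxy, num_retries - 1)
    return html
```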
5. Experimenting with "urllib.request" in the interpreter:
```
>>> import urllib.parse
>>> import urllib.request
>>> url = 'http://www.someserver.com/cgi-bin/register.cgi'
>>> user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'\
...
KeyboardInterrupt
>>> user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
>>> url
'http://www.someserver.com/cgi-bin/register.cgi'
>>> values = {'name': 'Michael Foord',
...           'location': 'Northampton',
...           'language': 'Python' }
>>> headers = {'User-Agent': user_agent}
>>> data=urllib.parse.urlencode(values)
>>> data
'location=Northampton&language=Python&name=Michael+Foord'
>>> data = data.encode('ascii')
>>> data
b'location=Northampton&language=Python&name=Michael+Foord'
>>> req = urllib.request.Request(url, data, headers)
>>> with urllib.request.urlopen(req) as response:
... the_page = response.read()
...     print(the_page)
b'<!DOCTYPE html><html><head><meta http-equiv="x-ua-compatible" content="IE=edge"><title></title><script type="text/javascript">(function() {var p = "eyJ1cmkiOiIvY2dpLWJpbi9yZWdpc3Rlci5jZ2kiLCJhcmdzIjoiIiwicmVmZXJlciI6IiJ9:1jHNCg:E4Xczfh7oF8UHMBAouFg0z9KGN8", as = "http://www.someserver.com/mtm/async/", f = "http://www.someserver.com/mtm/direct/";function d(n){window.location.href = "http://www42.someserver.com/"+n;}function ar(r) {if (r.slice(0, 1) !== ".") {try {window.location.assign(r);} catch (err) {}try {var mar = document.createElement("meta");mar.httpEquiv = "refresh";mar.content = "0;url="+r;document.getElementsByTagName("head")[0].appendChild(mar);} catch (err) {}} else {var s = document.createElement("span");s.id="ecode";s.appendChild(document.createTextNode(r.slice(1)));document.getElementsByTagName("body")[0].appendChild(s);}}if ("fetch" in window) {try {fetch(as + p + "/1", {credentials: "include"}).then(function(r) {if (!r.ok) {throw Error("50x");}return r.text();}).then(function(r) {ar(r);});} catch (err) {d(2);}} else {try {var x = new XMLHttpRequest();x.open("GET", as + p + "/2", false);x.onerror = function() {d(3);};x.onload = function() {if (x.status === 200) {ar(x.responseText);} else {d(4);}};x.onreadystatechange = function(r) {if (x.readyState === 4){if (x.status === 200) {ar(x.responseText);} else {d(6);}}};x.send();} catch (err) {d(5);}}})();</script><meta http-equiv="refresh" content="5;url=http://www.someserver.com/mtm/direct/eyJ1cmkiOiIvY2dpLWJpbi9yZWdpc3Rlci5jZ2kiLCJhcmdzIjoiIiwicmVmZXJlciI6IiJ9:1jHNCg:E4Xczfh7oF8UHMBAouFg0z9KGN8/1" /></head><body></body></html>'
>>>
```