Web Crawling
Index Building
Searching
A web crawler is a computer program that browses the World Wide Web in a methodical, automated manner. Web crawlers are mainly used to create a copy of all the visited pages for later processing by a search engine that will index the downloaded pages to provide fast searches.
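At its core, a crawler is a loop that retrieves a page, extracts the links it contains, and queues those links for later retrieval. The following is only a minimal sketch of that loop using urllib and BeautifulSoup (the same libraries the full crawler below uses); the start URL and the five-page limit are arbitrary choices for illustration.

from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

start = 'http://www.dr-chuck.com/'   # example starting point
to_visit = [start]
seen = set()

while to_visit and len(seen) < 5:    # small limit so the sketch stops quickly
    url = to_visit.pop()
    if url in seen : continue
    seen.add(url)
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    # Queue every anchor on the page, resolved to an absolute URL
    for tag in soup('a'):
        href = tag.get('href', None)
        if href is not None :
            to_visit.append(urljoin(url, href))

print(len(seen), 'pages visited')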
The robots.txt file is a way for a web site to communicate with web crawlers. It is an informal and voluntary standard that tells the crawler where to look and where not to look.
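Python's standard library can read and apply a robots.txt file directly. A small sketch using urllib.robotparser; the site is simply the crawler's default start URL and is used here only as an example.

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://www.dr-chuck.com/robots.txt')   # example site
rp.read()

# can_fetch() reports whether the rules allow a given user agent to visit a URL
print(rp.can_fetch('*', 'http://www.dr-chuck.com/'))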
Search engine indexing collects, parses, and stores data to facilitate fast and accurate information retrieval. The purpose of storing an index is to optimize speed and performance in finding relevant documents for a search query. Without an index, the search engine would scan every document in the corpus, which would require considerable time and computing power.
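A sketch of the idea behind an index: an inverted index maps each word to the set of documents that contain it, so a query only has to look at the documents listed under its words rather than scanning the whole corpus. The three sample documents below are invented for illustration.

# Toy documents (made up for illustration)
docs = {
    1: 'python web crawling with beautifulsoup',
    2: 'page rank and index building',
    3: 'fast searching with an inverted index',
}

# Build the inverted index: word -> set of document ids
index = {}
for doc_id, text in docs.items():
    for word in text.split():
        index.setdefault(word, set()).add(doc_id)

# A query now touches only the postings for its words
query = ['index', 'searching']
hits = set.intersection(*(index.get(w, set()) for w in query))
print(hits)   # {3}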
import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Connect to the SQLite database
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Create new tables
cur.execute('''CREATE TABLE IF NOT EXISTS Pages
    (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
     error INTEGER, old_rank REAL, new_rank REAL)''')

# UNIQUE(from_id, to_id) lets INSERT OR IGNORE skip duplicate links
cur.execute('''CREATE TABLE IF NOT EXISTS Links
    (from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''')

# The Webs table stores the top-level URL(s) that limit the crawl
cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')

# Check to see if we are already in progress...
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
row = cur.fetchone()
if row is not None:
    print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.")
else :
    starturl = input('Enter web url or enter: ')
    if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/'
    # Strip a trailing "/"
    if ( starturl.endswith('/') ) : starturl = starturl[:-1]
    web = starturl
    if ( starturl.endswith('.htm') or starturl.endswith('.html') ) :
        pos = starturl.rfind('/')
        web = starturl[:pos]

    if ( len(web) > 1 ) :
        cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) )
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) )
        conn.commit()

# Get the current webs
cur.execute('''SELECT url FROM Webs''')
webs = list()
for row in cur:
    webs.append(str(row[0]))

print(webs)

many = 0
while True:
    if ( many < 1 ) :
        sval = input('How many pages:')
        if ( len(sval) < 1 ) : break
        many = int(sval)
    many = many - 1

    cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
    try:
        row = cur.fetchone()
        # print(row)
        fromid = row[0]
        url = row[1]
    except:
        print('No unretrieved HTML pages found')
        many = 0
        break

    print(fromid, url, end=' ')

    # If we are retrieving this page, there should be no links from it
    cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) )
    try:
        document = urlopen(url, context=ctx)

        html = document.read()
        if document.getcode() != 200 :
            print("Error on page: ", document.getcode())
            cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) )

        if 'text/html' != document.info().get_content_type() :
            print("Ignore non text/html page")
            cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) )
            conn.commit()
            continue

        print('('+str(len(html))+')', end=' ')

        soup = BeautifulSoup(html, "html.parser")
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        break
    except:
        print("Unable to retrieve or parse page")
        cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
        conn.commit()
        continue

    cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) )
    cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) )
    conn.commit()

    # Retrieve all of the anchor tags
    tags = soup('a')
    count = 0
    for tag in tags:
        href = tag.get('href', None)
        if ( href is None ) : continue
        # Resolve relative references like href="/contact"
        up = urlparse(href)
        if ( len(up.scheme) < 1 ) :
            href = urljoin(url, href)
        ipos = href.find('#')
        if ( ipos > 1 ) : href = href[:ipos]
        if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue
        if ( href.endswith('/') ) : href = href[:-1]
        # print(href)
        if ( len(href) < 1 ) : continue

        # Check if the URL is in any of the webs
        found = False
        for web in webs:
            if ( href.startswith(web) ) :
                found = True
                break
        if not found : continue

        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) )
        count = count + 1
        conn.commit()

        cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, ))
        try:
            row = cur.fetchone()
            toid = row[0]
        except:
            print('Could not retrieve id')
            continue
        # print(fromid, toid)
        cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) )

    print(count)

cur.close()
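Because all of the crawl state lives in spider.sqlite, the crawler can be stopped and restarted at any point. A quick way to check progress between runs is to query the database directly; this sketch assumes the default database name used above.

import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Retrieved pages have their html stored; NULL html with no error means still queued
cur.execute('SELECT COUNT(*) FROM Pages WHERE html IS NOT NULL')
print('retrieved:', cur.fetchone()[0])
cur.execute('SELECT COUNT(*) FROM Pages WHERE html IS NULL AND error IS NULL')
print('queued:', cur.fetchone()[0])
cur.close()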
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Find the ids that send out page rank - we are only interested
# in pages in the SCC that have in and out links
cur.execute('''SELECT DISTINCT from_id FROM Links''')
from_ids = list()
for row in cur:
    from_ids.append(row[0])

# Find the ids that receive page rank
to_ids = list()
links = list()
cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
for row in cur:
    from_id = row[0]
    to_id = row[1]
    if from_id == to_id : continue
    if from_id not in from_ids : continue
    if to_id not in from_ids : continue
    links.append(row)
    if to_id not in to_ids : to_ids.append(to_id)

# Get the latest page ranks for the strongly connected component
prev_ranks = dict()
for node in from_ids:
    cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, ))
    row = cur.fetchone()
    prev_ranks[node] = row[0]

sval = input('How many iterations:')
many = 1
if ( len(sval) > 0 ) : many = int(sval)

# Sanity check
if len(prev_ranks) < 1 :
    print("Nothing to page rank. Check data.")
    quit()

# Let's do Page Rank in memory so it is really fast
for i in range(many):
    # print(list(prev_ranks.items())[:5])
    next_ranks = dict()
    total = 0.0
    for (node, old_rank) in list(prev_ranks.items()):
        total = total + old_rank
        next_ranks[node] = 0.0
    # print(total)

    # Find the number of outbound links and send the page rank down each
    for (node, old_rank) in list(prev_ranks.items()):
        # print(node, old_rank)
        give_ids = list()
        for (from_id, to_id) in links:
            if from_id != node : continue
            # print('   ', from_id, to_id)
            if to_id not in to_ids : continue
            give_ids.append(to_id)
        if ( len(give_ids) < 1 ) : continue
        amount = old_rank / len(give_ids)
        # print(node, old_rank, amount, give_ids)

        for id in give_ids:
            next_ranks[id] = next_ranks[id] + amount

    newtot = 0
    for (node, next_rank) in list(next_ranks.items()):
        newtot = newtot + next_rank
    evap = (total - newtot) / len(next_ranks)

    # print(newtot, evap)
    for node in next_ranks:
        next_ranks[node] = next_ranks[node] + evap

    newtot = 0
    for (node, next_rank) in list(next_ranks.items()):
        newtot = newtot + next_rank

    # Compute the per-page average change from old rank to new rank
    # as an indication of the convergence of the algorithm
    totdiff = 0
    for (node, old_rank) in list(prev_ranks.items()):
        new_rank = next_ranks[node]
        diff = abs(old_rank - new_rank)
        totdiff = totdiff + diff

    avediff = totdiff / len(prev_ranks)
    print(i+1, avediff)

    # rotate
    prev_ranks = next_ranks

# Put the final ranks back into the database
print(list(next_ranks.items())[:5])
cur.execute('''UPDATE Pages SET old_rank=new_rank''')
for (id, new_rank) in list(next_ranks.items()) :
    cur.execute('''UPDATE Pages SET new_rank=? WHERE id=?''', (new_rank, id))
conn.commit()
cur.close()
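The update rule above is easier to see on a toy graph: each page splits its current rank evenly among the outgoing links that point back into the rank set, and any rank that could not be handed out "evaporates" and is shared back equally so the total stays constant. The graph below is made up for illustration; page 4 receives a link but has none of its own, so it sits outside the rank set.

# Made-up graph: page -> pages it links to; page 4 has no outgoing links,
# so it is left out of the rank set, mirroring how the script above keeps
# only pages that both send and receive rank
out_links = {1: [2, 3], 2: [3], 3: [4]}
ranks = {node: 1.0 for node in out_links}

for i in range(20):
    next_ranks = {node: 0.0 for node in ranks}
    for node, targets in out_links.items():
        # Only pages inside the rank set can receive rank
        targets = [t for t in targets if t in ranks]
        if len(targets) < 1 : continue
        amount = ranks[node] / len(targets)
        for target in targets:
            next_ranks[target] += amount
    # Rank that was sent nowhere "evaporates" and is shared back equally
    evap = (sum(ranks.values()) - sum(next_ranks.values())) / len(next_ranks)
    for node in next_ranks:
        next_ranks[node] += evap
    ranks = next_ranks

print(ranks)   # page 3 ends up with the most rank, page 1 the least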
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
     FROM Pages JOIN Links ON Pages.id = Links.to_id
     WHERE html IS NOT NULL
     GROUP BY id ORDER BY inbound DESC''')

count = 0
for row in cur :
    if count < 50 : print(row)
    count = count + 1
print(count, 'rows.')
cur.close()
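The dump above sorts pages by the number of inbound links; once the ranking script has stored its results, the same join can be ordered by the computed rank instead. This is a small variation for illustration, not one of the original scripts.

import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Same join as above, but sorted by the PageRank value stored in new_rank
cur.execute('''SELECT COUNT(from_id) AS inbound, new_rank, Pages.id, url
     FROM Pages JOIN Links ON Pages.id = Links.to_id
     WHERE html IS NOT NULL
     GROUP BY Pages.id ORDER BY new_rank DESC LIMIT 10''')

for row in cur :
    print(row)
cur.close()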
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

print("Creating JSON output on spider.js...")
howmany = int(input("How many nodes? "))

cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
     FROM Pages JOIN Links ON Pages.id = Links.to_id
     WHERE html IS NOT NULL AND ERROR IS NULL
     GROUP BY id ORDER BY id,inbound''')

fhand = open('spider.js', 'w')
nodes = list()
maxrank = None
minrank = None
for row in cur :
    nodes.append(row)
    rank = row[2]
    if maxrank is None or maxrank < rank : maxrank = rank
    if minrank is None or minrank > rank : minrank = rank
    if len(nodes) > howmany : break

if maxrank == minrank or maxrank is None or minrank is None :
    print("Error - please run sprank.py to compute page rank")
    quit()

fhand.write('spiderJson = {"nodes":[\n')
count = 0
map = dict()
ranks = dict()
for row in nodes :
    if count > 0 : fhand.write(',\n')
    # print(row)
    rank = row[2]
    rank = 19 * ( (rank - minrank) / (maxrank - minrank) )
    fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',')
    fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}')
    map[row[3]] = count
    ranks[row[3]] = rank
    count = count + 1
fhand.write('],\n')

cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
fhand.write('"links":[\n')

count = 0
for row in cur :
    # print(row)
    if row[0] not in map or row[1] not in map : continue
    if count > 0 : fhand.write(',\n')
    rank = ranks[row[0]]
    srank = 19 * ( (rank - minrank) / (maxrank - minrank) )
    fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}')
    count = count + 1
fhand.write(']};')
fhand.close()
cur.close()
print("Open force.html in a browser to view the visualization")
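spider.js is a JavaScript assignment rather than plain JSON, but stripping the "spiderJson = " prefix and the trailing semicolon leaves a JSON object that Python can parse, which is a convenient sanity check before opening force.html. This sketch assumes the crawled URLs contain no characters that would need JSON escaping.

import json

with open('spider.js') as fhand:
    text = fhand.read()

# Drop the "spiderJson = " prefix and the trailing ";" to get plain JSON
data = json.loads(text.split('=', 1)[1].rstrip().rstrip(';'))
print(len(data['nodes']), 'nodes and', len(data['links']), 'links written')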