第一次用 Python 寫的簡單爬蟲,記錄在自己的博客。

#python.py
from bs4 import BeautifulSoup
import urllib.request
from MySqlite import MySqlite

# Running row ID passed to the database; incremented once per stored post.
g_intid = 0
def GetBlogTileAndName(url):
    """Fetch a blog index page and store every post title/link it lists.

    Downloads ``url``, finds each element whose class is ``postTitle``,
    prints the title and href of its link, and inserts both into the
    database through ``MySqlite.InsertDate`` under the next ``g_intid``.

    Args:
        url: Address of the page to download.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    global g_intid

    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as res:
        # Decode once here so the parser receives text rather than raw bytes.
        html = str(res.read(), 'utf-8')

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    store = MySqlite()

    for div in soup.find_all(attrs={"class": "postTitle"}):
        link = div.a
        href = link.get("href") if link is not None else None
        if href is None:
            # A title block without a usable link has nothing to store.
            continue
        print("title:=", link.string, "href:=", href)
        g_intid += 1
        store.InsertDate(g_intid, href, link.string)
def GetBlogPage(url):
    """Fetch a page and print the text of its ``pager`` element.

    Nothing is printed when the page has no element with class ``pager``.

    Args:
        url: Address of the page to download.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as res:
        # Decode once here so the parser receives text rather than raw bytes.
        html = str(res.read(), 'utf-8')

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    pager = soup.find(attrs={"class": "pager"})
    if pager is None:
        # find() returns None when no element matches; there is nothing to show.
        return
    print(pager.string)


# Crawl index pages 1 through 7 of the blog and store every post found.
for i in range(1, 8):
    url = r"http://www.cnblogs.com/FCoding/default.html?page=" + str(i)
    GetBlogTileAndName(url)

  

#MySqlite.py

class MySqlite(object):
    """Minimal SQLite-backed store for crawled blog entries.

    Rows live in a ``blog`` table with the columns ``(ID, url, title)``.
    Every operation opens its own connection and closes it when done.
    """

    # Database file used when no path is passed to the constructor.
    DEFAULT_DB_PATH = r"d:\123.db"

    def __init__(self, db_path=None):
        """Remember which database file to operate on.

        Args:
            db_path: Path of the SQLite file; ``None`` selects
                ``DEFAULT_DB_PATH``.
        """
        super().__init__()
        self.db_path = self.DEFAULT_DB_PATH if db_path is None else db_path

    def _connect(self):
        """Open a connection and make sure the ``blog`` table exists."""
        # Imported locally so the class works without a module-level import.
        import sqlite3

        conn = sqlite3.connect(self.db_path)
        try:
            # IF NOT EXISTS makes this safe to run on every call.
            conn.execute(
                "create table if not exists blog "
                "(ID integer, url text, title text, PRIMARY KEY(ID))"
            )
        except Exception:
            conn.close()
            raise
        return conn

    def callstr(self, str):
        """Print the given value."""
        print(str)

    def InsertDate(self, id, url, title):
        """Insert one blog entry and commit it.

        Args:
            id: Value for the ``ID`` column; also printed before the insert.
            url: Link of the post.
            title: Title of the post.

        Raises:
            sqlite3.Error: If the insert fails, e.g. when the table's primary
                key rejects a duplicate ``id``.
        """
        conn = self._connect()
        try:
            print(id)
            # Placeholders let SQLite handle quoting, so quotes inside a
            # title or URL can neither break nor alter the statement.
            conn.execute("insert into blog values (?, ?, ?)", (id, url, title))
            conn.commit()
        finally:
            conn.close()

    def GetDate(self):
        """Print the row count, then every field of every row, one per line."""
        conn = self._connect()
        try:
            cur = conn.cursor()
            cur.execute("select count(*) from blog")
            print(cur.fetchone()[0])
            for row in cur.execute("select * from blog"):
                for field in row:
                    print(field)
        finally:
            conn.close()

簡述一下功能:

通過 urllib 下載網頁,使用 BeautifulSoup 解析。

調用 find_all(attrs={"class":"postTitle"})

找到 HTML 中所有 class=postTitle 的 tag,

然後遍歷,取出 title 和 href 保存到數據庫中。

 

此程序無容錯。新手勿笑!

相關文章
相關標籤/搜索