【原創】python 豆瓣採集

新手今天剛學 python~~~ 有點凌亂~勉強看吧,只能算是給新手看看,見諒

簡單版本的豆瓣採集美圖~~~~~~ 美女每天有,有木有~~~

python 3.4

sqlite3

BeautifulSoup 4.4

import os
import sched
import sqlite3
import sys
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
 6 
 7 
 8 
 9 #sys.exit()
10 
# Open (or create) the SQLite database that remembers which topic URLs have
# already been crawled.
cx = sqlite3.connect('c:\\sqlite\\test.db')
# Module-level cursor used by init(); a `global` statement is unnecessary at
# module scope, so none is used here.
cu = cx.cursor()

# Look the table up by name instead of walking every table in the database:
# walking fails to create `caiji` when the database is empty and tries to
# create it repeatedly when other tables sort before it.
cu.execute("select name from sqlite_master where type='table' and name=?", ('caiji',))
if cu.fetchone() is None:
    print("表不存在,開始建立")
    cu.execute("create table caiji (id INTEGER PRIMARY KEY AUTOINCREMENT,pid integer,nickname text NULL); ")
    cx.commit()
else:
    print("存在")
23 #t=('grmlmgjsadf',)
24 #cx.execute("insert into caiji(nickname) values(?)",t)
25 #cx.commit()
26 #cu.execute('select * from caiji where nickname=\''+'grmlmgjsadf'+'\'')
27 #if cu.fetchall():
28 #    print('dsa')
29 
30 
31 #cu.close()
32 #cx.close()
33 
# Root directory under which downloaded images are stored (Windows path,
# trailing separator included).
path="d:\\imgs\\"
# strftime pattern (year, month, day) used to name the per-day image folder.
ISOTIMEFORMAT='%Y%m%d'
36 
37 
def dwonloadimg(uri):
    """Download the resource at *uri* into today's folder below ``path``.

    The target folder is ``path`` joined with the current local date formatted
    with ``ISOTIMEFORMAT``; it is created when missing. The file name is the
    part of *uri* after its last ``/``.

    Args:
        uri: URL of the image to fetch.

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        OSError: if the folder or file cannot be written.
    """
    day_dir = os.path.join(path, time.strftime(ISOTIMEFORMAT, time.localtime()))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(day_dir, exist_ok=True)

    name = uri[uri.rfind("/") + 1:]

    # Read the whole body before opening the output file so that a failed
    # download does not leave an empty file behind; `with` guarantees both
    # handles are closed even when an error is raised.
    with urllib.request.urlopen(uri) as conn:
        data = conn.read()
    with open(os.path.join(day_dir, name), 'wb') as f:
        f.write(data)
50     
51 
def Getarticle1(uri):
    """Fetch one topic page and download every image found in its figure blocks.

    Looks for ``<div class="topic-figure cc">`` elements, prints the ``src`` of
    the first ``<img>`` inside each one and passes it to ``dwonloadimg``.

    Args:
        uri: URL of the topic page.

    Raises:
        urllib.error.URLError: if the page cannot be opened.
    """
    # `with` closes the response even if read() raises.
    with urllib.request.urlopen(uri) as res:
        html = res.read()

    # The raw bytes go straight to BeautifulSoup, which detects the encoding
    # itself; the parser is named explicitly so results do not depend on which
    # optional parsers happen to be installed.
    bs = BeautifulSoup(html, 'html.parser')
    for figure in bs.find_all('div', class_="topic-figure cc"):
        img = figure.find('img')
        # Skip figure blocks that carry no usable image instead of crashing.
        if img is None or not img.get('src'):
            continue
        strc = img['src']
        print('圖片:', strc)
        dwonloadimg(strc)
def init():
    """Run one crawl pass over the group's topic list.

    Fetches the group index page, and for every topic link that is not yet
    recorded in the ``caiji`` table, records it and downloads its images via
    ``Getarticle1``. Uses the module-level connection ``cx`` and cursor ``cu``.

    Raises:
        urllib.error.URLError: if the index page or a topic page cannot be
            opened.
    """
    print('開始抓取')
    url="http://www.douban.com/group/haixiuzu/"
    # `with` closes the response; the original never closed it.
    with urllib.request.urlopen(url) as temp:
        html = temp.read()

    # Parser named explicitly so results do not depend on installed parsers.
    bs = BeautifulSoup(html, 'html.parser')
    for cell in bs.find_all('td', class_='title'):
        link = cell.a
        # Skip title cells without a usable link instead of crashing.
        if link is None or not link.get('href'):
            continue
        uri = link['href']

        # `cu` is only read here, so no `global` declaration is needed (and
        # declaring it after use is a SyntaxError on current Python 3).
        # The URL comes from a remote page: bind it as a parameter rather than
        # splicing it into the SQL text.
        cu.execute('select * from caiji where nickname=?', (uri,))
        if cu.fetchone() is None:
            print("新文章")
            cx.execute("insert into caiji(nickname) values(?)",(uri,))
            cx.commit()
            Getarticle1(uri)
    print("結束")
84 
85 
# Poll the group once a minute. Guarded so that importing this module does not
# start the endless loop.
if __name__ == "__main__":
    while True:
        try:
            init()
        except urllib.error.URLError as exc:
            # A transient network/HTTP failure should not end the crawler;
            # report it and try again on the next cycle.
            print('crawl failed, will retry:', exc)
        time.sleep(60)
相關文章
相關標籤/搜索