A while back I spent about a month building a news app. Its functionality is very simple: news items from the web edition are periodically crawled into a backend database, and the app then displays them.
1. The client
It uses the DCloud framework. I am basically a JavaScript newcomer who has never written any serious code, and even more of a newcomer to HTML5, so I went straight for an off-the-shelf front-end framework. I tried APPcan and APICloud, and in the end chose DCloud; its HBuilder editor really is quite good.
Here is part of the key code: it uses DCloud's pull-to-refresh mechanism and mui.ajax to fetch the JSON list returned by the backend.
<!DOCTYPE html>
<html>

<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=1,user-scalable=no" />
	<title></title>
	<script src="js/mui.min.js"></script>
	<link href="css/mui.min.css" rel="stylesheet" />
	<script type="text/javascript" charset="utf-8">
		// mui.init();
		var t;
		mui.init({
			pullRefresh: {
				container: "#pullMine", // pull-to-refresh container; any CSS selector that querySelector can locate works, e.g. an id or .class
				down: {
					contentdown: "下拉能夠刷新", // optional; caption shown while the control is in the "pull down to refresh" state
					contentover: "釋放當即刷新", // optional; caption shown once the pull is far enough that releasing will refresh
					contentrefresh: "正在刷新...", // optional; caption shown while refreshing
					callback: pulldownRefresh // required; refresh callback, implemented per business need, e.g. fetch new data from the server via ajax
				}
			}
		});

		mui.plusReady(function() {
			console.log("當前頁面URL:" + plus.webview.currentWebview().getURL());
			mui.ajax('http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd', {
				dataType: 'json',
				type: 'get',
				timeout: 10000,
				success: function(data) {
					t = data;
					var list = document.getElementById("list");
					var finallist = '';
					for (var i = data.length - 1; i >= 0; i--) {
						finallist = finallist + '<li data-id="' + i + '" class="mui-table-view-cell" ><a class="mui-navigate-right"><div class="mui-media-body">' + data[i].title + '<p class="mui-ellipsis">' + data[i].pubtime + '</p></div></a></li>';
					}
					list.innerHTML = finallist;
					console.log("no1" + finallist);
					mui('#list').on('tap', 'li', function() {
						mui.openWindow({
							url: 'detail_sd.html',
							id: 'detail_sd',
							extras: {
								title: t[this.getAttribute('data-id')].title,
								author: t[this.getAttribute('data-id')].author,
								pubtime: t[this.getAttribute('data-id')].pubtime,
								content: t[this.getAttribute('data-id')].content
							}
						})
					})
				},
				error: function() {}
			})
		})

		/*
		 * Pull-to-refresh business logic
		 */
		function pulldownRefresh() {
			setTimeout(function() {
				console.log("refreshing....");
				mui.ajax('http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd', {
					dataType: 'json',
					type: 'get',
					timeout: 10000,
					success: function(data) {
						t = data;
						var list = document.getElementById("list");
						var finallist = '';
						for (var i = data.length - 1; i >= 0; i--) {
							finallist = finallist + '<li data-id="' + i + '" class="mui-table-view-cell" ><a class="mui-navigate-right"><div class="mui-media-body">' + data[i].title + '<p class="mui-ellipsis">' + data[i].pubtime + '</p></div></a></li>';
							// finallist=finallist+'<li data-id="'+i+'" class="mui-table-view-cell" ><a class="mui-navigate-right"><div class="mui-media-body">'+data[i].title+'<p class="mui-ellipsis">'+data[i].content+'</p></div></a></li>';
						}
						list.innerHTML = finallist;
					},
					error: function() {}
				});
				mui('#pullMine').pullRefresh().endPulldownToRefresh(); // refresh completed
			}, 1500);
		}
	</script>
</head>

<body>

	<!--<div id="pullMine" class="mui-content mui-scroll-wrapper">
		<div class="mui-scroll">
			<ul class="mui-table-view" id="list">
			</ul>
		</div>
	</div>-->

	<div id="pullMine" class="mui-content mui-scroll-wrapper">
		<div class="mui-scroll">
			<ul class="mui-table-view" id="list">
			</ul>
		</div>
	</div>

</body>

</html>
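For reference, the success callback above assumes that getlist_sd returns a JSON array of news objects: it caches the array in t, renders title and pubtime into the list, and passes title, author, pubtime and content on to detail_sd.html through the openWindow extras. A rough sketch of the expected response shape, written here as a Python literal with placeholder values (the real rows come from the backend tables filled by the crawler described below):

# hypothetical sketch of the JSON array the client expects; values are placeholders
expected_response = [
    {"title": "headline 1", "author": "editor", "pubtime": "2015-10-29", "content": "<p>body ...</p>"},
    {"title": "headline 2", "author": "editor", "pubtime": "2015-10-30", "content": "<p>body ...</p>"},
]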
2. The backend PHP publishing side
It uses the ThinkPHP framework.
<?php
namespace Home\Controller;
use Think\Controller;

class NewsController extends Controller {

    public function getlist() {
        $newsList = M('news')->order('pubtime asc')->limit(30)->select();
        echo json_encode($newsList);
    }

    public function getlist_sd() {
        $newsList = M('newssd')->order('pubtime asc')->limit(30)->select();
        echo json_encode($newsList);
    }
}
?>
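Both actions simply echo json_encode($newsList), so the endpoint can be sanity-checked outside the app. A minimal sketch, assuming the Python requests package is available and reusing the URL called by the client above:

# check_endpoint.py (sketch) -- quick sanity check of the getlist_sd endpoint
import requests

url = 'http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd'
news = requests.get(url, timeout=10).json()  # should parse as a JSON array of rows

print(len(news))         # at most 30, because of limit(30) in the controller
print(news[0]['title'])  # fields the client reads: title, author, pubtime, content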
3. The backend crawler
It uses Scrapy to crawl the news content and write it into the DB.
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from twisted.python import log  # used by _handle_error below
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors


class JsonWithEncodingtutorialPipeline(object):
    def __init__(self):
        self.file = codecs.open('qdnews.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    # called by Scrapy when the spider finishes
    def close_spider(self, spider):
        self.file.close()


class MySQLStoretutorialPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool
        print("-----------init sql proc---")

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # called by the framework for every item
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d

    # update or insert each row into the database
    def _do_upinsert(self, conn, item, spider):
        print(item['link'][0])
        linkmd5id = self._get_linkmd5id(item)
        print linkmd5id
        print("--------------")
        now = datetime.now().replace(microsecond=0).isoformat(' ')
        conn.execute("""
            select 1 from tp_news where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()
        print('ret=', ret)

        if ret:
            print "1111111111"
            conn.execute("""
                update tp_news set title = %s, content = %s, author = %s, pubtime = %s, pubtime2 = %s, link = %s, updated = %s where linkmd5id = %s
            """, (item['title'][0][4:-5], item['content'][0], item['pubtime'][0][16:-4],
                  item['pubtime'][0][-14:-4], item['pubtime'][0][-14:-4], item['link'][0], now, linkmd5id))
        else:
            print '2222222222'
            conn.execute("""
                insert into tp_news(linkmd5id, title, content, author, link, updated, pubtime, pubtime2)
                values(%s, %s, %s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'][0][4:-5], item['content'][0], item['pubtime'][0][16:-4],
                  item['link'][0], now, item['pubtime'][0][-14:-4], item['pubtime'][0][-14:-4]))

    # md5-hash the url, to avoid collecting the same article twice
    def _get_linkmd5id(self, item):
        return md5(item['link'][0]).hexdigest()

    # error handling
    def _handle_error(self, failure, item, spider):
        log.err(failure)
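from_settings() above pulls the MySQL connection parameters out of the Scrapy settings, and the header comment is a reminder to register the pipelines in ITEM_PIPELINES. A minimal settings.py sketch under those assumptions; the tutorial.pipelines module path follows the standard project layout implied by "from tutorial.items import DmozItem" in the spiders, and the host, database name and credentials are placeholders:

# settings.py (sketch) -- placeholder values, adjust to the real environment
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'newssystem'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'secret'

# register the pipelines; lower numbers run earlier
ITEM_PIPELINES = {
    'tutorial.pipelines.JsonWithEncodingtutorialPipeline': 300,
    'tutorial.pipelines.MySQLStoretutorialPipeline': 800,
}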
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pubtime = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    content = scrapy.Field()
    id = scrapy.Field()
    # the experimental spiders below also assign these two fields,
    # so declare them to avoid "DmozItem does not support field" errors
    date = scrapy.Field()
    detail = scrapy.Field()
spiders.py
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup

from scrapy.spiders import CrawlSpider
from scrapy.loader import ItemLoader
from scrapy.linkextractors.sgml import SgmlLinkExtractor

import scrapy


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        # filename = response.url.split("/")[-2]
        # open(filename, 'wb').write(response.body)
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


class DmozSpider2(BaseSpider):
    name = "dmoz2"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        "http://10.60.32.179/Site/Site1/myindex.shtml",
        # "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        # filename = response.url.split("/")[-2]
        # open(filename, 'wb').write(response.body)
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['date'] = site.select('span/text()').extract()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


class MySpider(BaseSpider):
    name = "myspider"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
        # 'http://example.com/page2',
    ]

    def parse(self, response):
        # collect `item_urls`
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        base_url = get_base_url(response)
        items = []
        for item_url in item_urls:
            yield Request(url=response.url, callback=self.parse_item, meta={'items': items})

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')

        item = DmozItem()
        items = response.meta['items']
        item['date'] = item_urls.select('span/text()').extract()
        item['title'] = item_urls.select('a/text()').extract()
        item['link'] = item_urls.select('a/@href').extract()
        item['desc'] = item_urls.select('text()').extract()

        # item_details_url=item['link']
        # populate `item` fields
        relative_url = item_urls.select('a/@href').extract()
        print(relative_url[0])
        base_url = get_base_url(response)
        item_details_url = urljoin_rfc(base_url, relative_url[0])
        yield Request(url=item_details_url, callback=self.parse_details, dont_filter=True,
                      meta={'item': item, 'items': items})

    def parse_details(self, response):
        # item = response.meta['item']
        # populate more `item` fields
        print("***********************In parse_details()***************")
        hxs = HtmlXPathSelector(response)
        print("-------------------------------")
        print(response.url)
        item_detail = hxs.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()
        print("________________", item_detail)
        item = response.meta['item']
        item['detail'] = item_detail
        items = response.meta['items']
        items.append(item)  # fixed: was items.append[item]
        return items


class DmozSpider3(BaseSpider):
    name = "dmoz3"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['date'] = site.select('span/text()').extract()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()

            print(item['link'][0])
            base_url = get_base_url(response)
            relative_url = item['link'][0]
            item_details_url = urljoin_rfc(base_url, relative_url)
            print("*********************", item_details_url)
            # response2=BeautifulSoup(urlopen(item_details_url).read())
            # note: this Response is built without a body, so the XPath below
            # extracts nothing; left here as in the original experiment
            response2 = scrapy.http.Response(item_details_url)
            hxs2 = HtmlXPathSelector(response2)
            item['detail'] = hxs2.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()

            items.append(item)
        return items


class MySpider5(BaseSpider):
    name = "myspider5"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
        # 'http://example.com/page2',
    ]

    items = []
    item = DmozItem()

    def parse(self, response):
        # collect `item_urls`
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')

        base_url = get_base_url(response)

        for item_url in item_urls:
            MySpider5.item['date'] = item_url.select('span/text()').extract()
            MySpider5.item['title'] = item_url.select('a/text()').extract()
            MySpider5.item['link'] = item_url.select('a/@href').extract()
            MySpider5.item['desc'] = item_url.select('text()').extract()

            relative_url = MySpider5.item['link']
            print(relative_url[0])
            base_url = get_base_url(response)
            item_details_url = urljoin_rfc(base_url, relative_url[0])
            print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx=', str(item_details_url)

            yield Request(url=item_details_url, callback=self.parse_details)

    # def parse_item(self, response):
    #     hxs=HtmlXPathSelector(response)
    #     item_urls=hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
    #
    #     # item_details_url=item['link']
    #     # populate `item` fields
    #     relative_url=item_urls.select('a/@href').extract()
    #     print(relative_url[0])
    #     base_url = get_base_url(response)
    #     item_details_url=urljoin_rfc(base_url, relative_url[0])
    #     print 'item urls============================================================='
    #     yield Request(url=item_details_url,callback=self.parse_details,dont_filter=True,meta={'item':item,'items':items})

    def parse_details(self, response):
        # item = response.meta['item']
        # populate more `item` fields
        print("***********************In parse_details()***************")
        hxs = HtmlXPathSelector(response)
        print("----------------------------------------------------------------")
        print(response.url)
        item_detail = hxs.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()
        print("________________", item_detail)
        # item=response.meta['item']
        # item['detail']=item_detail
        # items.append(item)
        MySpider5.item['detail'] = item_detail
        MySpider5.items.append(MySpider5.item)
        return MySpider5.item

    # leftover experimental callback; `item` is not defined in this scope
    def parse_details2(self, response):
        # item = response.meta['item']
        # populate more `item` fields
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('title', item['title'])
        abc = {'detail': '/html/body/center/div/div[4]/div[1]/p[1]'}
        bbsItem_loader.add_xpath('detail', abc['detail'])
        return bbsItem_loader.load_item()


class MySpider6(CrawlSpider):
    name = "myspider6"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
        # 'http://example.com/page2',
    ]
    link_extractor = {
        # 'page':SgmlLinkExtractor(allow='/bbsdoc,board,\w+\.html$'),
        # 'page_down':SgmlLinkExtractor(allow='/bbsdoc,board,\w+,page,\d+\.html$'),
        'page': SgmlLinkExtractor(allow='/Article/\w+\/\w+\.shtml$'),
    }

    _x_query = {
        'date': 'span/text()',
        'date2': '/html/body/center/div/div[4]/p',
        'title': 'a/text()',
        'title2': '/html/body/center/div/div[4]/h2'
    }
    _y_query = {
        'detail': '/html/body/center/div/div[4]/div[1]/p[1]',
    }

    def parse(self, response):
        self.t = 0
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)
            self.t = self.t + 1

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('desc', url)
        bbsItem_loader.add_value('link', url)
        bbsItem_loader.add_xpath('title', self._x_query['title2'])
        bbsItem_loader.add_xpath('pubtime', self._x_query['date2'])
        bbsItem_loader.add_xpath('content', self._y_query['detail'])
        bbsItem_loader.add_value('id', self.t)  # why not useful?
        return bbsItem_loader.load_item()


class MySpider6SD(CrawlSpider):
    name = "myspider6sd"
    allowed_domains = ["10.60.7.45"]
    start_urls = [
        'http://10.60.7.45/SITE_sdyc_WEB/Site1219/index.shtml',
        # 'http://example.com/page2',
    ]
    link_extractor = {
        # 'page':SgmlLinkExtractor(allow='/bbsdoc,board,\w+\.html$'),
        # 'page_down':SgmlLinkExtractor(allow='/bbsdoc,board,\w+,page,\d+\.html$'),
        'page': SgmlLinkExtractor(allow='/Article/\w+\/\w+\.shtml$'),
        # http://10.60.32.179/Site/Col411/Article/201510/35770_2015_10_29_8058797.shtml
        # http://10.60.7.45/SITE_sdyc_WEB/Col1527/Article/201510/sdnw_2110280_2015_10_29_91353216.shtml
    }

    _x_query = {
        'date': 'span/text()',
        'date2': '/html/body/center/div/div[4]/p',
        'title': 'a/text()',
        # 'title2':'/html/body/center/div/div[4]/h2'
        'title2': '/html/body/div[4]/div[1]/div[2]/div[1]/h1[2]/font',
        # 'author':'/html/body/div[4]/div[1]/div[2]/div[1]/div/span[1]'
        # 'pubtime2':'/html/body/div[4]/div[1]/div[2]/div[1]/div/span[2]'
    }
    _y_query = {
        # 'detail':'/html/body/center/div/div[4]/div[1]/p[1]',
        'detail': '//*[@id="Zoom"]',
    }

    def parse(self, response):
        self.t = 0
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)
            self.t = self.t + 1

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('desc', url)
        bbsItem_loader.add_value('link', url)
        bbsItem_loader.add_xpath('title', self._x_query['title2'])
        bbsItem_loader.add_xpath('pubtime', self._x_query['date2'])
        bbsItem_loader.add_xpath('content', self._y_query['detail'])
        bbsItem_loader.add_value('id', self.t)  # why not useful?
        return bbsItem_loader.load_item()
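The intro mentions that the news items are crawled into the database on a schedule, but the scheduler itself is not part of the code above. One simple option is to run the finished spider periodically with the standard scrapy crawl command, either from cron or from a small wrapper script. A minimal sketch of such a wrapper; the spider name comes from MySpider6SD above, while the one-hour interval and the assumption that it is launched from inside the Scrapy project directory are mine:

# run_crawl.py (sketch) -- rerun the spider on a fixed interval
import subprocess
import time

INTERVAL_SECONDS = 3600  # assumed interval: once an hour

while True:
    # must run inside the Scrapy project so `scrapy crawl` can find the spider
    subprocess.call(['scrapy', 'crawl', 'myspider6sd'])
    time.sleep(INTERVAL_SECONDS)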