huputitle_spiders.py
```python
#coding:utf-8
import scrapy
from huputitle.items import HuputitleItem
from scrapy.crawler import CrawlerProcess


class hupuSpider(scrapy.Spider):
    name = 'huputitle'
    allowed_domains = ["bbs.hupu.com"]
    start_urls = ["https://bbs.hupu.com/bxj"]

    def parse(self, response):
        item = HuputitleItem()
        item['titles'] = response.xpath('//a[@id=""]/text()').extract()  # extract the post titles
        # print 'titles',item['titles']
        yield item
        # follow the "next page" link until there is none
        next_href = response.xpath('//a[@id="j_next"]/@href').extract_first()
        if next_href:
            yield scrapy.Request("https://bbs.hupu.com" + next_href, callback=self.parse)
```
items.py
```python
# -*- coding: utf-8 -*-
import scrapy


class HuputitleItem(scrapy.Item):
    # define the fields for your item here like:
    titles = scrapy.Field()
```
pipelines.py
```python
# -*- coding: utf-8 -*-
import os
import urllib
from huputitle import settings
import sys
reload(sys)
sys.setdefaultencoding("utf-8")


class HuputitlePipeline(object):
    def process_item(self, item, spider):
        for title in item['titles']:
            # print 'title',title
            fo = open("foo.txt", "a")
            fo.write("".join(title) + "\r\n")
            fo.close()
        return item
```
settings.py
```python
BOT_NAME = 'huputitle'

SPIDER_MODULES = ['huputitle.spiders']
NEWSPIDER_MODULE = 'huputitle.spiders'

ITEM_PIPELINES = {
    'huputitle.pipelines.HuputitlePipeline': 1,
}

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
```
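With the project configured, the crawl can be started with Scrapy's standard `scrapy crawl huputitle` command. Since the spider file already imports `CrawlerProcess`, it can also be launched from a plain Python script; here is a minimal sketch (the runner file and the spider's module path are my assumptions, not part of the original project):

```python
# run.py -- sketch: launch the spider from a script instead of "scrapy crawl"
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# assumed module path: huputitle/spiders/huputitle_spiders.py
from huputitle.spiders.huputitle_spiders import hupuSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py (pipeline, USER_AGENT, ...)
process.crawl(hupuSpider)
process.start()  # blocks until the crawl is finished
```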
In the end I crawled 100 pages, a little over 20,000 titles.
Here I used the jieba library to segment the titles into words.
hupudivide.py
```python
#encoding=utf-8
import jieba
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

fo = open("hupu.txt", "r")
fi = open("hupudi.txt", "w")

lines = fo.readlines()
for line in lines:
    seg_list = jieba.cut_for_search(line)
    fi.write(" \n".join(seg_list))
```
The segmentation produced about 170,000 words.
Then I counted how often each word appears.
huPuCounter.py
```python
#encoding=utf-8
import jieba
import jieba.analyse
import time
from collections import Counter
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

fo = open("hupudi.txt", "r")
fi = open("hupunum.txt", "w")
fl = open("hupunumword.txt", "w")
f = open("hupuword.txt", "w")

lines = fo.readlines()
d = {}
for line in lines:
    if line not in d:
        d[line] = 1
    else:
        d[line] = d[line] + 1

d = sorted(d.items(), key=lambda item: item[1], reverse=True)

for k in d:
    fi.write("%s%d\n" % (k[0][:-1].encode('utf-8'), k[1]))
    if len(k[0][:-1].encode('utf-8')) >= 6:
        fl.write("%s%d\n" % (k[0][:-1].encode('utf-8'), k[1]))
    f.write("%s" % (k[0][:-1].encode('utf-8')))
```
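The script imports `collections.Counter` but counts by hand with a dict; the same frequency count can be expressed with `Counter` directly. A minimal sketch, reusing the file name from above:

```python
# Sketch: the same word-frequency count using collections.Counter
from collections import Counter

with open("hupudi.txt", "r") as fo:
    counts = Counter(line.rstrip("\n") for line in fo)  # word -> number of occurrences

# most_common() returns (word, count) pairs sorted by descending count
for word, num in counts.most_common():
    print("%s %d" % (word, num))
```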
Here I also split the counts into words of fewer than two characters and words of two or more characters; the distribution is shown in the figure.
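The chart itself isn't reproduced here; the following is only a sketch of how such a distribution could be plotted with matplotlib from the two count files written above (the bucket labels and plot styling are my own choices, not from the original post):

```python
# Sketch: bar chart of how many distinct words fall into each length bucket,
# based on the files written by huPuCounter.py
import matplotlib.pyplot as plt

def count_lines(path):
    with open(path) as f:
        return sum(1 for _ in f)

total_words = count_lines("hupunum.txt")      # every distinct word
long_words = count_lines("hupunumword.txt")   # words of two or more characters
short_words = total_words - long_words

positions = [0, 1]
plt.bar(positions, [short_words, long_words])
plt.xticks(positions, ["< 2 characters", ">= 2 characters"])
plt.ylabel("number of distinct words")
plt.show()
```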
makeHupuCloud.py
```python
#encoding=utf-8
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

text_from_file_with_apath = open('foo.txt').read()

wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
wl_space_split = " ".join(wordlist_after_jieba)

backgroud_Image = plt.imread('huputag.jpg')
my_wordcloud = WordCloud(
    background_color='white',
    mask=backgroud_Image,
).generate(wl_space_split)

plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
```
Here I used Python's wordcloud library to generate the word cloud; the mask image is the Hupu logo.
I also used jieba's part-of-speech tagging on the segmented words to generate a chart of the results.
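The original post doesn't show the tagging code; the following is a minimal sketch of how the parts of speech could be extracted and counted with jieba's `posseg` module (reading `hupu.txt` from above; the counting and printing are my own additions):

```python
# Sketch: tag each word with its part of speech using jieba.posseg,
# then count how often each POS tag appears.
import jieba.posseg as pseg
from collections import Counter

text = open("hupu.txt").read()

pos_counts = Counter()
for word, flag in pseg.cut(text):  # flag is the POS tag, e.g. 'n' (noun), 'v' (verb)
    pos_counts[flag] += 1

# print the ten most common parts of speech
for flag, num in pos_counts.most_common(10):
    print("%s %d" % (flag, num))
```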