python中文亂碼

需求:爬取豆瓣上top250電影信息

結構:title,type,summry,url,rate

from bs4 import BeautifulSoup
import requests

# Download the Top 250 landing page and pull out, for each list entry,
# its detail-page link, its first title span and its rating.
url = 'https://movie.douban.com/top250'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text,'lxml')
# All three selectors share the same path down to each entry's info box.
info_box = '#content > div > div.article > ol > li > div > div.info'
urls = soup.select(info_box + ' > div.hd > a')
titles = soup.select(info_box + ' > div.hd > a > span:nth-of-type(1)')
rates = soup.select(info_box + ' > div.bd > div > span.rating_num')


# Build one record per list entry. 'type' and 'summry' start out empty and
# are filled in later from each movie's detail page.
# NOTE(review): the name `list` shadows the builtin; it is kept because the
# rest of the script refers to it.
list = []
data = None
for title, url, rate in zip(titles, urls, rates):
    data = dict(
        title=title.get_text(),
        url=url.get('href'),
        rate=rate.get_text(),
        type=None,
        summry=None,
    )
    list.append(data)
# Visit every movie's detail page to fill in its genre list and summary.
for item in list:
    url = item['url']
    item_wb_data = requests.get(url)
    soup = BeautifulSoup(item_wb_data.text,'lxml')
    infos = soup.select('#info > span')
    summry = soup.select('#link-report > span[property="v:summary"]')

    # Collect the spans that sit between the "genre" label and the
    # "country/region of production" label.
    # NOTE(review): Douban pages are presumably in Simplified Chinese; these
    # Traditional-script labels may be an artifact of how the article was
    # converted -- verify against the live page.
    movietype = []
    collecting = False
    for info in infos:
        text = info.get_text()
        if text == '類型:':
            collecting = True
        elif text == '製片國家/地區:':
            break
        elif collecting:
            movietype.append(text)

    # Stored as the list's string form, e.g. "['a', 'b']".
    item['type'] = str(movietype)
    if summry:
        item['summry'] = str(summry[0].get_text())
'''       
for each_item in list:
        print("title : " + each_item['title'])
        print("type : " + each_item['type'])
        print("summry : " + each_item['summry'])
        print("url : " + each_item['url'])
        print("rate : " + each_item['rate'])'''

# First attempt: write every record to webData.txt, encoding the summary to
# GBK and ignoring the characters GBK cannot represent.
with open('webData.txt','w') as outFile:
    for each_item in list:
        print("title : " + each_item['title'],file = outFile)
        print("type : " + each_item['type'],file = outFile)
        if each_item['summry'] is None:
            print("summry : " ,file = outFile)
        else:
            # str() of a bytes object yields its b'...' repr, so the file
            # receives escape sequences rather than readable text.
            print("summry : " + str(each_item['summry'].encode('GBK', 'ignore')),file = outFile)
        print("url : " + each_item['url'],file = outFile)
        print("rate : " + each_item['rate'],file = outFile)

紅色部分(else 分支中寫入 summry 的那一行)是修改過的,原來代碼爲:

print("summry : " + each_item['summry'],file = outFile)

報錯內容:

Traceback (most recent call last):
File "testDemo.py", line 58, in <module>
print("summry : " + each_item['summry'],file = outFile)
UnicodeEncodeError: 'gbk' codec can't encode character '\ufc3a' in position 43: illegal multibyte sequence

查過不少資料,其中比較有效的方式是忽略gbk編碼時不能識別的符號,即紅色標註部分的代碼,其參考資料來源爲:

http://www.crifan.com/unicodeencodeerror_gbk_codec_can_not_encode_character_in_position_illegal_multibyte_sequence/

根據此方式雖然在python運行過程當中不再報錯,可是獲得的內容,在文件中直接打開仍是亂碼。做以下修改:

# Final version: write every record to webData.txt as readable text.
with open('webData.txt','w') as outFile:
    for each_item in list:
        print("title : " + each_item['title'],file = outFile)
        print("type : " + each_item['type'],file = outFile)
        if each_item['summry'] is None:
            print("summry : " ,file = outFile)
        else:
            # Round-tripping through GBK with 'ignore' drops every character
            # GBK cannot represent, so what remains is a str that the GBK
            # file codec (the one named in the traceback) can write.
            # NOTE(review): open('webData.txt', 'w', encoding='utf-8') would
            # keep those characters instead of discarding them.
            print("summry : " + each_item['summry'].encode('gbk','ignore').decode('gbk','ignore'),file = outFile)
        print("url : " + each_item['url'],file = outFile)
        print("rate : " + each_item['rate'],file = outFile)

搞定!

 

其餘方式解決,參考:

http://jerrypeng.me/2014/02/python-2-unicode-print-pitfall/

代碼以下:

# Rewrap standard output so that text printed to it is encoded as UTF-8.
# NOTE(review): this only affects stdout; the print() calls in this script
# pass file=outFile, so the codec used for webData.txt is unchanged.
# NOTE(review): in Python 3 io.TextIOWrapper presumably expects the binary
# buffer (sys.stdout.buffer) rather than the text stream itself -- confirm.
sys.stdout = io.TextIOWrapper(sys.stdout,encoding='utf8')

將 Python 使用系統默認的編碼設置爲utf-8,運行時仍然報錯。

UnicodeEncodeError: 'gbk' codec can't encode character '\u2022' in position 190: illegal multibyte sequence

# Write every record to webData.txt; the summary is round-tripped through
# GBK with 'ignore' so characters GBK cannot represent are dropped before
# the text reaches the file.
with open('webData.txt','w') as outFile:
    for each_item in list:
        print("title : " + each_item['title'],file = outFile)
        print("type : " + each_item['type'],file = outFile)
        if each_item['summry'] is None:
            print("summry : " ,file = outFile)
        else:
            print("summry : " + each_item['summry'].encode('gbk','ignore').decode('gbk','ignore'),file = outFile)
        print("url : " + each_item['url'],file = outFile)
        print("rate : " + each_item['rate'],file = outFile)

相關文章
相關標籤/搜索