Crawling Meishij (美食傑)
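This script crawls the regional-cuisine index of meishij.net (美食傑): it walks the cuisine categories, opens every recipe page, extracts the dish attributes, the step-by-step instructions, and the ingredient list, downloads all images to a local folder, and stores the results in three MySQL tables (dish, cook_dish, dish_food). The dependencies install with pip install requests beautifulsoup4 lxml pymysql. The table definitions are not part of the original script, so the snippet below is only a minimal sketch: the column names come from the INSERT statements further down, while the column types are assumptions.

# setup_tables.py -- one-time schema setup (a sketch; column types are
# assumptions inferred from the INSERT statements, not from the original post)
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS dish (
        id INT AUTO_INCREMENT PRIMARY KEY,
        category_first VARCHAR(64), category_second VARCHAR(64),
        name VARCHAR(255), url VARCHAR(255), `comment` TEXT,
        tec VARCHAR(64), difficulty VARCHAR(64), num VARCHAR(64),
        taste VARCHAR(64), prepare_time VARCHAR(64), cook_time VARCHAR(64),
        skills TEXT, origin_url VARCHAR(512), finished_url TEXT,
        local_finished_url TEXT, msj_url VARCHAR(512))""",
    """CREATE TABLE IF NOT EXISTS cook_dish (
        id INT AUTO_INCREMENT PRIMARY KEY,
        dish_name VARCHAR(255), step_num VARCHAR(16), step_name TEXT,
        url VARCHAR(255), origin_url VARCHAR(512))""",
    """CREATE TABLE IF NOT EXISTS dish_food (
        id INT AUTO_INCREMENT PRIMARY KEY,
        `type` VARCHAR(64), food_name VARCHAR(128), num VARCHAR(64),
        dish_name VARCHAR(255))""",
]

db = pymysql.connect(host='127.0.0.1', port=3306, user='json',
                     passwd='123456', db='meishij', charset='utf8')
with db.cursor() as cursor:
    for stmt in DDL:
        cursor.execute(stmt)
db.commit()
db.close()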

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re

import requests
import pymysql
from bs4 import BeautifulSoup  # lxml must also be installed; it backs the 'lxml' parser used below

# Local directory where downloaded images are saved; must exist beforehand.
img_base_path = '/Users/chanming/Desktop/msj/img/'

# MySQL connection; adjust host/credentials to your environment.
db = pymysql.connect(host='127.0.0.1', port=3306, user='json', passwd='123456', db='meishij', charset='utf8')
cursor = db.cursor()

# Check whether a dish has already been crawled (parameterized to avoid SQL injection).
check_dish_sql = "SELECT COUNT(id) FROM dish WHERE name=%s"

# Insert a dish record.
dish_insert_sql = "INSERT INTO `meishij`.`dish`( `category_first`, `category_second`, `name`, `url`, `comment`, `tec`, `difficulty`, `num`, `taste`, `prepare_time`, `cook_time`, `skills`, `origin_url`, `finished_url`, `local_finished_url`, `msj_url`) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

# Insert one cooking step of a dish.
cook_dish_insert_sql = "INSERT INTO `meishij`.`cook_dish`( `dish_name`, `step_num`, `step_name`, `url`, `origin_url`) VALUES ( %s, %s, %s, %s, %s)"

# Insert one ingredient of a dish.
dish_food_insert_sql = "INSERT INTO `meishij`.`dish_food`( `type`, `food_name`, `num`, `dish_name`) VALUES ( %s, %s, %s, %s)"



def crawl():
    start_url = 'https://www.meishij.net/china-food/caixi/'
    r = requests.get(start_url, verify=False)  # note: TLS certificate verification is disabled
    soup = BeautifulSoup(r.text, 'lxml')
    category1List = soup.find_all('dl', class_='listnav_dl_style1')
    for category1 in category1List:
        # first-level category (cuisine group)
        category1name = category1.find('dt').get_text()
        aList = category1.find_all('a')
        # second-level categories (individual cuisines)
        # arrs = ['川菜', '湘菜', '粵菜', '東北菜', '魯菜', '浙菜', '蘇菜', '清真菜', '閩菜', '滬菜', '京菜', '湖北菜', '徽菜', '豫菜', '西北菜', '雲貴菜', '江西菜', '山西菜']
        for atag in aList:
            category2name = atag.get_text()
            # if category2name in arrs:
            #     continue
            # resume filter: earlier runs already covered the cuisines listed above,
            # so this pass only crawls the '其它菜' (other cuisines) category
            if category2name != '其它菜':
                continue
            print('%s %s' % (category1name, category2name))

            r_category = requests.get(atag['href'])
            soup_category = BeautifulSoup(r_category.text, 'html.parser')
            # the total page count is embedded in a hidden <input name="lm"> element
            page_total_dom = soup_category.select_one('input[name="lm"]')
            start_index = 1  # raise this to resume an interrupted crawl partway through
            if page_total_dom:
                page_text = page_total_dom.get_text()
                page_text_arr = re.findall(r'\d+', page_text)
            else:
                page_text_arr = ['1']
            for pageIndex in range(start_index, int(page_text_arr[0]) + 1):
                page_url = atag['href'] + '?&page=' + str(pageIndex)
                print(page_url)
                r_page = requests.get(page_url)
                soup_page = BeautifulSoup(bs_preprocess(r_page.text), 'lxml')
                dish_list = soup_page.select('div[class="listtyle1"]')
                for dish in dish_list:
                    a_dish = dish.find('a')
                    r_dish = requests.get(a_dish['href'], verify=False)
                    soup_dish = BeautifulSoup(bs_preprocess(r_dish.text), 'lxml')
                    print()
                    print(a_dish['href'])
                    try:
                        name = soup_dish.select_one('a[id="tongji_title"]').get_text()
                    except Exception:
                        # list items without a recipe title are ads; skip them
                        print('advertisement, skipping to the next item')
                        continue
                    print(name)
                    # skip dishes that were already stored by a previous run
                    cursor.execute(check_dish_sql, (name,))
                    count = cursor.fetchone()
                    if count[0] >= 1:
                        continue
                    
                    db_dish = {}
                    db_dish['msj_url'] = a_dish['href']
                    db_dish['category_first'] = category1name
                    db_dish['category_second'] = category2name
                    db_dish['name'] = name

                    # the six attribute <li> tags (technique, difficulty, servings,
                    # taste, prep time, cook time) are siblings of the first li.w127
                    li_sig = soup_dish.find('li', class_='w127')
                    li_list = [li_sig] + li_sig.find_next_siblings('li')

                    # each attribute sits in an <a> when it links to a filter page,
                    # otherwise in a plain <div>
                    def li_text(li):
                        a_tag = li.find('a')
                        return a_tag.get_text() if a_tag else li.find('div').get_text()

                    tec = li_text(li_list[0])           # 工藝 (technique)
                    difficulty = li_text(li_list[1])    # 難度 (difficulty)
                    num = li_text(li_list[2])           # 人數 (servings)
                    taste = li_text(li_list[3])         # 口味 (taste)
                    prepare_time = li_text(li_list[4])  # 準備時間 (prep time)
                    cook_time = li_text(li_list[5])     # 烹飪時間 (cook time)

                    # finished-dish photos live in the swiper carousel slides
                    swiper_list = soup_dish.find_all('div', class_='swiper-slide')
                    furl_list = []   # remote image URLs
                    flurl_list = []  # local file names
                    for swiper in swiper_list:
                        furl = swiper.find('img')['src']
                        fsurl = furl[furl.rfind('/') + 1:]
                        img = requests.get(furl)
                        with open(img_base_path + fsurl, 'wb') as f:
                            f.write(img.content)
                        furl_list.append(furl)
                        flurl_list.append(fsurl)

                        
                    db_dish['tec'] = tec
                    db_dish['difficulty'] = difficulty
                    db_dish['num'] = num
                    db_dish['taste'] = taste
                    db_dish['prepare_time'] = prepare_time
                    db_dish['cook_time'] = cook_time
                    db_dish['finished_url'] = ','.join(furl_list)
                    db_dish['local_finished_url'] = ','.join(flurl_list)

                    # the dish description paragraph, when present
                    if soup_dish.find('div', class_='materials'):
                        comment_p = soup_dish.find('div', class_='materials').find('p')
                        db_dish['comment'] = comment_p.get_text() if comment_p else ''
                    else:
                        db_dish['comment'] = ''

                    # cover image: remote URL plus the bare file name used locally
                    origin_url = soup_dish.find('div', class_='cp_headerimg_w').find('img')['src']
                    db_dish['origin_url'] = origin_url
                    url = origin_url[origin_url.rfind('/') + 1:]
                    db_dish['url'] = url

                    # download the cover image; ignore download failures
                    try:
                        img = requests.get(origin_url)
                        with open(img_base_path + url, 'wb') as f:
                            f.write(img.content)
                    except Exception:
                        pass

                    # detect which of three step layouts the recipe page uses:
                    #   1 (default): steps sit in <div class="content clearfix"> blocks
                    #   2: each step is a <p> starting with an <em class="step"> number
                    #   3: an <img> directly follows each <em class="step">
                    liFlag = 1
                    step_flag = soup_dish.find('em', class_='step')
                    if step_flag and step_flag.parent.name == 'p':
                        liFlag = 2
                    if soup_dish.select('em.step + img'):
                        liFlag = 3

                    # cooking tips under the '烹飪技巧' heading
                    h2_title = soup_dish.find('h2', text='烹飪技巧')
                    try:
                        if h2_title:
                            if liFlag == 2:
                                # tips are spread over several <p> siblings
                                skill_list = [p.get_text() for p in h2_title.find_next_siblings('p') if p.get_text() != '']
                                skills = '|'.join(skill_list)
                            else:
                                skills = h2_title.find_next_sibling('p').get_text()
                        else:
                            skills = ''
                    except Exception:
                        skills = ''

                    db_dish['skills'] = skills
                    cursor.execute(dish_insert_sql, (
                        db_dish['category_first'], db_dish['category_second'], db_dish['name'],
                        db_dish['url'], db_dish['comment'], db_dish['tec'], db_dish['difficulty'],
                        db_dish['num'], db_dish['taste'], db_dish['prepare_time'], db_dish['cook_time'],
                        db_dish['skills'], db_dish['origin_url'], db_dish['finished_url'],
                        db_dish['local_finished_url'], db_dish['msj_url']))
                    db.commit()

                    if liFlag == 2:
                        # layout 2: step text is the <em>'s parent <p>; the step image,
                        # if any, sits in the following <p>
                        step_list = soup_dish.find_all('em', class_='step')
                        for step in step_list:
                            cook_dish = {}
                            cook_dish['step_name'] = step.parent.get_text()
                            # the step number is the prefix before the first '.'
                            cook_dish['step_num'] = cook_dish['step_name'][0:cook_dish['step_name'].find('.')]
                            cook_dish['dish_name'] = db_dish['name']
                            img_p = step.find_parent().find_next_sibling('p')
                            step_img = img_p.find('img') if img_p else None
                            if step_img:
                                cook_dish['origin_url'] = step_img['src']
                                cook_dish['url'] = cook_dish['origin_url'][cook_dish['origin_url'].rfind('/') + 1:]
                                img = requests.get(cook_dish['origin_url'])
                                with open(img_base_path + cook_dish['url'], 'wb') as f:
                                    f.write(img.content)
                            else:
                                cook_dish['origin_url'] = ''
                                cook_dish['url'] = ''

                            cursor.execute(cook_dish_insert_sql, (cook_dish['dish_name'], cook_dish['step_num'], cook_dish['step_name'], cook_dish['url'], cook_dish['origin_url']))
                            db.commit()
                    elif liFlag == 3:
                        # layout 3: the step image is a direct sibling of the <em>
                        step_list = soup_dish.find_all('em', class_='step')
                        for step in step_list:
                            cook_dish = {}
                            cook_dish['step_name'] = step.parent.get_text()
                            cook_dish['step_num'] = cook_dish['step_name'][0:cook_dish['step_name'].find('.')]
                            cook_dish['dish_name'] = db_dish['name']
                            step_img = step.find_next_sibling('img')
                            if step_img:
                                cook_dish['origin_url'] = step_img['src']
                                cook_dish['url'] = cook_dish['origin_url'][cook_dish['origin_url'].rfind('/') + 1:]
                                img = requests.get(cook_dish['origin_url'])
                                with open(img_base_path + cook_dish['url'], 'wb') as f:
                                    f.write(img.content)
                            else:
                                cook_dish['origin_url'] = ''
                                cook_dish['url'] = ''
                            cursor.execute(cook_dish_insert_sql, (cook_dish['dish_name'], cook_dish['step_num'], cook_dish['step_name'], cook_dish['url'], cook_dish['origin_url']))
                            db.commit()
                    else:
                        # layout 1: each step is a <div class="content clearfix"> block
                        step_list = soup_dish.find_all('div', class_='content clearfix')
                        for step in step_list:
                            cook_dish = {}
                            cook_dish['step_name'] = step.get_text()
                            cook_dish['step_num'] = cook_dish['step_name'][0:cook_dish['step_name'].find('.')]
                            cook_dish['dish_name'] = db_dish['name']

                            try:
                                cook_dish['origin_url'] = step.find('img')['src']
                                cook_dish['url'] = cook_dish['origin_url'][cook_dish['origin_url'].rfind('/') + 1:]
                                # download the step image
                                img = requests.get(cook_dish['origin_url'])
                                with open(img_base_path + cook_dish['url'], 'wb') as f:
                                    f.write(img.content)
                            except Exception:
                                cook_dish['origin_url'] = ''
                                cook_dish['url'] = ''

                            cursor.execute(cook_dish_insert_sql, (cook_dish['dish_name'], cook_dish['step_num'], cook_dish['step_name'], cook_dish['url'], cook_dish['origin_url']))
                            db.commit()

                    # main ingredients (主料)
                    zl_div = soup_dish.find('div', class_='yl zl clearfix')
                    if zl_div:
                        for zl_li in zl_div.find_all('li'):
                            dish_food = {}
                            dish_food['type'] = '主料'
                            h4_d = zl_li.find('h4')
                            dish_food['food_name'] = h4_d.find('a').get_text()
                            dish_food['num'] = h4_d.find('span').get_text()
                            dish_food['dish_name'] = db_dish['name']
                            cursor.execute(dish_food_insert_sql, (dish_food['type'], dish_food['food_name'], dish_food['num'], dish_food['dish_name']))
                            print('主料:' + dish_food['food_name'])
                            db.commit()

                    # auxiliary ingredients and seasonings (輔料)
                    fl_div_list = soup_dish.find_all('div', class_='yl fuliao clearfix')
                    for fl_div in fl_div_list:
                        a_label = fl_div.select_one('h3 a')
                        for fl_li in fl_div.find_all('li'):
                            dish_food = {}
                            dish_food['type'] = a_label.get_text()
                            dish_food['dish_name'] = db_dish['name']
                            dish_food['food_name'] = fl_li.find('h4').find('a').get_text()
                            dish_food['num'] = fl_li.find('h4').find_next_sibling('span').get_text()
                            cursor.execute(dish_food_insert_sql, (dish_food['type'], dish_food['food_name'], dish_food['num'], dish_food['dish_name']))
                            db.commit()
                            print(dish_food['type'] + ':' + dish_food['food_name'])



        




def bs_preprocess(html):
    """Remove distracting whitespace and newline characters from raw HTML."""
    pat = re.compile(r'(^[\s]+)|([\s]+$)', re.MULTILINE)
    html = re.sub(pat, '', html)         # strip leading and trailing whitespace on each line
    html = re.sub(r'\n', ' ', html)      # convert remaining newlines to spaces
    html = re.sub(r'[\s]+<', '<', html)  # drop whitespace before opening tags
    html = re.sub(r'>[\s]+', '>', html)  # drop whitespace after closing tags
    return html

if __name__ == '__main__':
    crawl()
    db.close()
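
To see what bs_preprocess does to the markup before it reaches BeautifulSoup, here is a quick check on a made-up fragment:

fragment = '<div>\n  <a> hello </a>\n</div>'
print(bs_preprocess(fragment))
# prints: <div><a>hello</a></div>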