簡單抓取安居客房產數據,並保存到Oracle數據庫

思路和上一篇差不多,先獲取網站html文件,使用BeautifulSoup進行解析,將對應屬性取出,逐一處理,最後把整理出的記錄保存到oracle中,持久化儲存。

'''
Created on 2017年2月20日

@author: Administrator
'''
from urllib import parse, request
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
# NOTE: the star import also exports datetime's `time` class; `import time`
# below must stay AFTER this line so that `time` refers to the time module.
from datetime import *

import numpy as np
import pandas as pd
import time
import re
import socket
import traceback
import logging

def get_page(url, timeout=60):
    """Download ``url`` and return the response body as text.

    The request is sent with fixed browser-like headers (User-Agent, Referer
    and Host pointing at jinan.anjuke.com).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``urlopen`` as the blocking-operation
            timeout. Defaults to 60, the value that was previously installed
            as the process-wide socket default.

    Returns:
        The body decoded as UTF-8; bytes that cannot be decoded are dropped.

    Exceptions raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Referer': r'http://jinan.anjuke.com/sale/b151-m161-o5-p1/',
        'Host': r'jinan.anjuke.com',
        'Connection': 'keep-alive'
    }
    req = request.Request(url, headers=headers)
    # The timeout is scoped to this request instead of mutating the global
    # socket default, and the response is closed even if read() fails.
    with request.urlopen(req, timeout=timeout) as response:
        raw = response.read()
    return raw.decode('utf-8', 'ignore')
# Search-result page that is scraped (price 150-250, area 120-200 filters).
LIST_URL = 'http://jinan.anjuke.com/sale/b151-m161-o5-p1/?from_price=150&to_price=250&from_area=120&to_area=200'
# NOTE(review): credentials are embedded in the URL; consider moving them to
# configuration before using real account data.
DB_URL = 'oracle+cx_oracle://user:pass@localhost/orcl'
TABLE_NAME = 'anju_house'

# Column order of the frame that is written to the database.
COLUMNS = ["address", "floor", "house_name", "href", "m2", "price", "room",
           "unit_price", "web", "year", "op_time"]
NUMERIC_COLUMNS = ("m2", "price", "unit_price")

# Integer or decimal number; unlike r"\d+\.*\d+" it also matches one digit.
NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")


def extract_number(text):
    """Return the first integer/decimal number in ``text`` as a string.

    Returns None when ``text`` is empty/None or contains no digits.
    """
    found = NUMBER_RE.search(text or "")
    return found.group(0) if found else None


def parse_listing(tr):
    """Build one record (dict) from a ``<li class="list-item">`` tag.

    Raises ValueError when the area, price or unit price contains no number
    or when fewer than four detail spans are present; AttributeError /
    TypeError / KeyError surface when an expected tag or attribute is missing.
    """
    # Listing title
    str_name = tr.find("div", "house-title").find('a').text.strip()
    # Link to the detail page
    str_href = tr.find("a", "houseListTitle")["href"]

    # Property attributes: the first four spans are read as
    # room layout, area, floor and construction year, in that order.
    details = [s.text for s in tr.find("div", "details-item").find_all('span')]
    room, area_text, floor, year = details[:4]

    # Address
    str_add = tr.find("span", "comm-address").text.strip()
    str_add = re.sub(r"(\xa0\xa0\n)", "", str_add)

    # Prices
    price_box = tr.find("div", "pro-price")
    numbers = {
        'm2': extract_number(area_text),
        'price': extract_number(price_box.find('span', 'price-det').text),
        'unit_price': extract_number(price_box.find('span', 'unit-price').text),
    }
    missing = [key for key, value in numbers.items() if value is None]
    if missing:
        raise ValueError("no numeric value found for: " + ", ".join(missing))

    return {'web': '安居客', 'house_name': str_name, 'room': room,
            'm2': numbers['m2'], 'price': numbers['price'],
            'unit_price': numbers['unit_price'], 'floor': floor,
            'year': year, 'address': str_add, 'href': str_href}


def build_frame(rows, op_time):
    """Assemble the records into a DataFrame with the COLUMNS layout.

    ``op_time`` is stamped on every row. The numeric columns are converted to
    integers via float, so decimal strings such as "120.5" are truncated
    instead of raising.
    """
    # Build the frame once from all rows; DataFrame.append no longer exists
    # in pandas 2.x and was quadratic when called per row.
    df = pd.DataFrame(rows, columns=COLUMNS)
    df["op_time"] = op_time
    for col in NUMERIC_COLUMNS:
        df[col] = df[col].astype('float').astype('int')
    return df


def save_frame(df):
    """Append ``df`` to TABLE_NAME; write errors are logged, not raised."""
    engine = create_engine(DB_URL)
    # The with-block closes the connection on every exit path.
    with engine.connect() as cnx:
        try:
            df.to_sql(TABLE_NAME, cnx, if_exists='append', index=False)
        except Exception:
            logging.exception("Failed to write listings to %s", TABLE_NAME)


def main():
    """Scrape the listing page and store the parsed records in Oracle."""
    curDate = date.today().strftime('%Y%m%d')
    logName = 'Anjuke_%s.log' % curDate
    logging.basicConfig(level=logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename=logName,
                filemode='a')

    html = get_page(LIST_URL)
    soup = BeautifulSoup(html, "lxml")

    rows = []
    for tr in soup.find_all('li', 'list-item'):
        try:
            rows.append(parse_listing(tr))
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # One malformed listing should not abort the whole run.
            logging.exception("Skipping a listing that could not be parsed")

    df = build_frame(rows, time.strftime('%Y-%m-%d'))
    save_frame(df)


if __name__ == '__main__':
    main()

相關文章
相關標籤/搜索