&&&&python
# -*- coding: utf-8 -*-
"""Look up companies from an Excel sheet on tianyancha.com and print their details.

For every company name read from the workbook, the script runs a site search,
follows the first hit to the company page, scrapes a handful of registration
fields, and prints them on one line. The total elapsed time is printed last.
"""
import sys
import time

import requests
import xlrd
from bs4 import BeautifulSoup

if sys.version_info[0] == 2:
    # Python 2 only: the output code below concatenates UTF-8 byte-string
    # labels with unicode values, which relies on a UTF-8 default encoding.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

SEARCH_URL = 'https://www.tianyancha.com/search'
COMPANY_URL_PREFIX = 'https://www.tianyancha.com/company/'
INFO_TABLE_CLASS = 'table companyInfo-table f14'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# One initial try plus one retry, matching the original retry-once intent.
MAX_ATTEMPTS = 2

# SECURITY NOTE(review): a logged-in session cookie (including an auth token
# and a phone number) is hard-coded below. It should be loaded from the
# environment or a config file that is not committed, and this one rotated.
_AUTH_TOKEN = (
    'eyJhbGciOiJIUzUxMiJ9.'
    'eyJzdWIiOiIxNTUxNzcwNzkzNSIsImlhdCI6MTUyNjg5Njk1NCwiZXhwIjoxNTQyNDQ4OTU0fQ.'
    'daBETlYxMpGXywxEvJCOtHKDFDi0XN-UTJBT9OWe4OaVOfEp_iFMmw3o5mTc-CpDrnF-jCPwxvZK9Iobn9iwRw'
)
_COOKIE = (
    'TYCID=e445690096c411e7b5b4bfe4760c4e65; '
    'uccid=c7ae83a70ec8b2157746346fbb1c33ce; '
    'ssuid=5112913316; '
    'aliyungf_tc=AQAAAAQzsD4A4w0AVdXNfHCBi5H/1dg1; '
    'csrfToken=o9nXy2N8cgdG0_N5Iz8CYnt-; '
    'RTYCID=4ce5b0431c934dc3bbebe5695ab2863c; '
    'bannerFlag=true; '
    'token=5d20e41769f143dea5806c52bf6c953b; '
    '_utm=2a425496ba624a8482fa563771854afa; '
    'tyc-user-info=%257B%2522token%2522%253A%2522' + _AUTH_TOKEN +
    '%2522%252C%2522integrity%2522%253A%25220%2525%2522'
    '%252C%2522state%2522%253A%25220%2522'
    '%252C%2522redPoint%2522%253A%25220%2522'
    '%252C%2522vipManager%2522%253A%25220%2522'
    '%252C%2522vnum%2522%253A%25220%2522'
    '%252C%2522onum%2522%253A%25220%2522'
    '%252C%2522mobile%2522%253A%252215517707935%2522%257D; '
    'auth_token=' + _AUTH_TOKEN + '; '
    'Hm_lvt_e92c8d65d92d534b0fc290df538b4758='
    '1526366266,1526368826,1526889740,1526892761; '
    'Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1526896940'
)

# Shared by every request; previously duplicated inside getUrl and qiChaCha.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 '
        'Safari/537.36'
    ),
    'Cookie': _COOKIE,
}


def getCompanyList(path=u'企查查.xlsx'):
    """Return every cell value of the workbook's first sheet except the first.

    Cells are collected row by row, left to right; the very first cell is
    dropped (presumably a header - confirm against the actual sheet).

    Args:
        path: Workbook to read. Defaults to the file the script has always used.

    Returns:
        A list of cell values (possibly empty when the sheet has no cells).
    """
    # NOTE(review): xlrd 2.0+ only reads .xls files; opening an .xlsx workbook
    # requires xlrd < 2.0 - confirm the pinned version.
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheets()[0]
    cells = []
    for rowIndex in range(sheet.nrows):
        cells.extend(sheet.row_values(rowIndex))
    # Slicing instead of pop(0) so an empty sheet yields [] rather than raising.
    return cells[1:]


def getUrl(companyName):
    """Search for *companyName* and return the URL of the first hit's page.

    Args:
        companyName: Name to search for; formatted with ``%s`` so non-string
            cell values (e.g. numbers) are accepted.

    Returns:
        The company page URL, or ``None`` when the search page contains no
        usable result link.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    response = requests.get(
        SEARCH_URL,
        # Passed as a parameter so the name is always URL-encoded correctly.
        params={'key': u'%s' % companyName},
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(response.text, 'lxml')

    title = soup.find('div', class_='title overflow-width')
    link = title.a if title is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        return None

    # The company id is taken from the second '-'-separated piece of the
    # href, minus its first character (unchanged from the original parsing).
    pieces = href.split('-')
    if len(pieces) < 2:
        return None
    return COMPANY_URL_PREFIX + pieces[1][1:]


def qiChaCha(companyName, url):
    """Scrape registration details for one company page.

    Args:
        companyName: Name echoed back as the first element of the result.
        url: Company page URL, as returned by ``getUrl``.

    Returns:
        ``[companyName, companyType, industry, registeredAddress, registry]``.
        Scraped values may be ``None`` when a cell holds nested markup.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If the page lacks the expected company-info elements.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'lxml')

    infoTable = soup.find('table', class_=INFO_TABLE_CLASS)
    industryCell = soup.find('td', colspan="2")
    addressCell = soup.find('td', colspan="4")
    if infoTable is None or industryCell is None or addressCell is None:
        raise ValueError('company info not found at %s' % url)

    cells = infoTable.find_all('td')
    if len(cells) <= 18:
        raise ValueError('company info table is incomplete at %s' % url)

    companyType = cells[8].string  # company type
    industry = industryCell.string  # industry
    # The last four characters of the cell text are dropped (presumably a
    # trailing link label - confirm against the live page).
    registeredAddress = addressCell.text[0:-4]  # registered address
    registry = cells[18].string  # registration authority
    return [companyName, companyType, industry, registeredAddress, registry]


def _formatRecord(record):
    """Render a qiChaCha result as one labelled, space-separated line."""
    labels = ('企業:', '公司類型:', '行業: ', '註冊地址: ', '登記機構:')
    parts = []
    for label, value in zip(labels, record):
        # Missing values print as empty; non-string values are stringified.
        text = u'' if value is None else u'%s' % value
        parts.append(label + text)
    return ' '.join(parts)


def _fetchRecord(companyName):
    """Look up one company with a bounded retry; return its record or None."""
    lastError = None
    for _ in range(MAX_ATTEMPTS):
        try:
            url = getUrl(companyName)
            if url is None:
                print('No search result for %s' % companyName)
                return None
            return qiChaCha(companyName, url)
        except (requests.RequestException, ValueError) as exc:
            lastError = exc
    print('Failed to fetch %s: %s' % (companyName, lastError))
    return None


def main():
    """Process every company in the workbook and print the elapsed seconds."""
    start = time.time()
    for companyName in getCompanyList():
        record = _fetchRecord(companyName)
        if record is not None:
            print(_formatRecord(record))
    end = time.time()
    print(end - start)


if __name__ == '__main__':
    main()
&&&&app