# URL template for the per-city weather JSON endpoint; %s is the city code.
url = "http://m.weather.com.cn/data/%s.html"
# Text file from which the city codes are extracted.
path = "./conf/codeAPI.txt"


def main():
    """Fetch the weather JSON for every city code found in ``path``.

    Each line of the code file is scanned for runs of digits; every run is
    treated as a city code, substituted into ``url`` and downloaded.  The
    response body is handed to ``weatherAnalysis.xmlBuilder``.

    The crawl is best-effort: a failure for one code is reported on stderr
    and the loop moves on to the next code.
    """
    import sys

    # The context manager closes the code file even if an iteration fails.
    with open(path, "r") as code_file:
        for line in code_file:
            for code in re.findall(r'[\d]+', line):
                try:
                    stream = urllib2.urlopen(url % code)
                    try:
                        payload = stream.read()
                    finally:
                        # Release the HTTP connection whether or not the
                        # read succeeded.
                        stream.close()
                    weatherAnalysis.xmlBuilder(payload)
                except Exception as e:
                    # Deliberately broad: one bad city must not stop the
                    # crawl, but the failure should be visible.
                    sys.stderr.write(
                        "skipping city code %s: %s\n" % (code, e))
                    continue
# Shared document root; every call to xmlBuilder appends one city to it.
root = etree.Element("weatherinfos")
# Output path template; %s is filled with a "YYYY-MM-DD-HHh" timestamp.
data_xml = "data/weatherinfo%s.xml"
# Separator between the two temperature values in a "tempN"/"tempFN" string.
splitor = '~'
# Suffixes appended to the first and second temperature value respectively.
subffixlow = 'L'
subffixhigh = 'H'


def xmlBuilder(f):
    """Add one city's forecast to the shared tree and rewrite the XML file.

    Args:
        f: JSON text containing a top-level ``"weatherinfo"`` object.

    The whole accumulated ``root`` tree is serialized on every call and
    written to the file named by ``data_xml`` with the current hour stamp,
    so calls within the same hour overwrite the same file.
    """
    js = json.loads(f)
    root.append(jsonAnalyser(js["weatherinfo"]))
    stamp = datetime.datetime.now().strftime("%Y-%m-%d-%Hh")
    # Serialize before opening so a serialization error cannot leave a
    # truncated file behind.
    serialized = etree.tostring(root, pretty_print=True, encoding='utf-8')
    # tostring() with an explicit encoding yields encoded bytes, hence the
    # binary mode; the context manager guarantees the data is flushed and
    # the handle closed.
    with open(data_xml % stamp, "wb") as out:
        out.write(serialized)


def jsonAnalyser(js):
    """Convert one ``weatherinfo`` JSON object into a ``<weatherinfo>`` element.

    Args:
        js: mapping with the keys ``city``, ``city_en``, ``date``, ``week``
            and, for N in 1..5, ``tempN``, ``tempFN``, ``weatherN`` and
            ``windN``.  Each temperature value must contain two parts
            separated by ``splitor``.

    Returns:
        The element, carrying five ``<time_range>`` children.  Each child's
        ``<time>`` is the hour of "now + 4*(N-1) hours".

    Raises:
        KeyError: if an expected key is missing.
        IndexError: if a temperature string has no ``splitor``.
    """
    element = etree.Element(
        "weatherinfo", city=js["city"], city_en=js["city_en"],
        date=js["date"], week=js["week"])
    # Read the clock once so all five ranges share the same base time.
    now = datetime.datetime.now()
    for x in range(1, 6):
        idx = str(x)
        d = now + datetime.timedelta(hours=4 * (x - 1))
        time_range = etree.SubElement(element, "time_range")
        etree.SubElement(time_range, "time").text = d.strftime("%H")

        tempC = js["temp" + idx].split(splitor)
        etree.SubElement(time_range, "tempCL").text = tempC[0] + subffixlow
        etree.SubElement(time_range, "tempCH").text = tempC[1] + subffixhigh

        tempF = js["tempF" + idx].split(splitor)
        etree.SubElement(time_range, "tempFL").text = tempF[0] + subffixlow
        etree.SubElement(time_range, "tempFH").text = tempF[1] + subffixhigh

        etree.SubElement(time_range, "weather").text = js["weather" + idx]
        etree.SubElement(time_range, "wind").text = js["wind" + idx]
    return element

# Parsed data for one city:
<weatherinfo city="北京" city_en="beijing" date="" week="星期日"> <time_range> <time>22</time> <tempCL>16℃L</tempCL> <tempCH>30℃H</tempCH> <tempFL>60.8℉L</tempFL> <tempFH>86℉H</tempFH> <weather>晴</weather> <wind>微風</wind> </time_range> <time_range> <time>02</time> <tempCL>17℃L</tempCL> <tempCH>29℃H</tempCH> <tempFL>62.6℉L</tempFL> <tempFH>84.2℉H</tempFH> <weather>多雲</weather> <wind>微風</wind> </time_range> <time_range> <time>06</time> <tempCL>17℃L</tempCL> <tempCH>25℃H</tempCH> <tempFL>62.6℉L</tempFL> <tempFH>77℉H</tempFH> <weather>多雲轉小雨</weather> <wind>微風</wind> </time_range> <time_range> <time>10</time> <tempCL>15℃L</tempCL> <tempCH>26℃H</tempCH> <tempFL>59℉L</tempFL> <tempFH>78.8℉H</tempFH> <weather>小雨轉陰</weather> <wind>微風轉北風3-4級</wind> </time_range> <time_range> <time>14</time> <tempCL>15℃L</tempCL> <tempCH>30℃H</tempCH> <tempFL>59℉L</tempFL> <tempFH>86℉H</tempFH> <weather>晴</weather> <wind>微風</wind> </time_range> </weatherinfo>
Apriori 算法(Apriori algorithm)是關聯規則挖掘中的一項基本算法,由 Rakesh Agrawal 和 Ramakrishnan Srikant 兩位博士於 1994 年提出。關聯規則的目的是在一個數據集中找出項與項之間的關係,也被稱爲購物籃分析(Market Basket Analysis),因爲「購物籃分析」很貼切地表達了該算法適用情景中的一個子集。關於這個算法有一個非常有名的故事:「尿布和啤酒」。故事是這樣的:美國的婦女們常常會囑咐她們的丈夫下班後爲孩子買尿布,而丈夫在買完尿布後又會順手買回自己愛喝的啤酒,所以啤酒和尿布一起被購買的機會很多。超市發現這一規律後,把尿布和啤酒擺放在一起銷售,這個舉措使尿布和啤酒的銷量雙雙增長,並一直爲衆商家所津津樂道。
def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
    """Return the items of the data whose support exceeds the minimum support.

    NOTE: the implementation is not shown in this listing.
    """


def joinSet(itemSet, length):
    """Join a set with itself (Cartesian product) with duplicates removed.

    NOTE: the implementation is not shown in this listing.
    """


def getItemSetTransactionList(data_iterator):
    """Convert the data into a set of items and a list of transaction sets.

    NOTE: the implementation is not shown in this listing.
    """


def runApriori(data_iter, minSupport, minConfidence):
    """Run the Apriori algorithm over the weather records.

    Args:
        data_iter: record source, passed to ``xmlAnalysis.parseWeatherXML``.
        minSupport: threshold passed to ``returnItemsWithMinSupport``.
        minConfidence: minimum confidence a rule needs to be returned.

    Returns:
        A pair ``(items, rules)`` where
        - ``items`` is a list of ``(tuple(itemset), support)``
        - ``rules`` is a list of ``((pretuple, posttuple), confidence)``

    NOTE(review): ``getSupport`` and ``subsets`` are not defined in this
    listing; they are expected to be provided elsewhere in the module.
    """
    itemSet, transactionList = getItemSetTransactionList(
        xmlAnalysis.parseWeatherXML(data_iter))

    freqSet = defaultdict(int)
    # Maps itemset size n -> the n-itemsets that satisfy minSupport.
    largeSet = dict()

    currentLSet = returnItemsWithMinSupport(
        itemSet, transactionList, minSupport, freqSet)

    # Grow the itemsets one element at a time until no candidate of the
    # next size survives the support threshold.
    k = 2
    while currentLSet != set():
        largeSet[k - 1] = currentLSet
        candidates = joinSet(currentLSet, k)
        currentLSet = returnItemsWithMinSupport(
            candidates, transactionList, minSupport, freqSet)
        k = k + 1

    # Visit the sizes in ascending order so the output does not depend on
    # dict iteration order.
    sizes = sorted(largeSet)

    toRetItems = []
    for size in sizes:
        toRetItems.extend(
            [(tuple(item), getSupport(item)) for item in largeSet[size]])

    toRetRules = []
    for size in sizes:
        # Rules need both a premise and a conclusion, so 1-itemsets are
        # skipped explicitly instead of slicing the dict's item list.
        if size < 2:
            continue
        for item in largeSet[size]:
            for element in map(frozenset, subsets(item)):
                remain = item.difference(element)
                if len(remain) > 0:
                    # float() keeps this a true division even if the
                    # supports are integers under Python 2.
                    confidence = float(getSupport(item)) / getSupport(element)
                    if confidence >= minConfidence:
                        toRetRules.append(
                            ((tuple(element), tuple(remain)), confidence))
    return toRetItems, toRetRules