1. 用 Python 通過 csv 文件裏面的某一列形成鍵值,然後統計鍵在其他列出現的次數。
import pandas as pd
import numpy as np
import csv
import codecs
import sys
# Load both CSV files into pandas DataFrames when the module runs.
# Neither `data_original` nor `data` is referenced again in the script shown
# here; the functions below re-read the same two paths through the csv module.
data_original = pd.read_csv('D:/csv_data_original.csv')
data = pd.read_csv('D:/week1.csv')
# Disabled pandas filtering on the `retweeted_status_mid` column, kept for
# reference. NOTE(review): fillna(..., inplace=True) returns None, so the first
# line would rebind `data` to None if it were re-enabled as written.
#data = data['retweeted_status_mid'].fillna('NOT PROVIDED',inplace=True)
#data_transpond = data[data['retweeted_status_mid'] != 'NOT PROVIDED']
def _gbk_text(field):
    """Return *field* re-decoded as GBK.

    The CSV files are opened as ISO-8859-1, so encoding the field back to
    ISO-8859-1 recovers the original bytes; those bytes are then decoded as
    GBK, dropping any byte sequence that is not valid GBK.
    """
    return field.encode("iso-8859-1").decode("gbk", "ignore")


# Count how many times each original post was reposted.
def statistics(path1, path2, counts=None):
    """Build a key table from one CSV and count key occurrences in another.

    Pass 1 reads ``path2`` and creates one zero-initialised entry per distinct
    value of column 0. Pass 2 reads ``path1`` and adds 1 to an entry each time
    its key appears in column 1. The first row of each file is treated as a
    header and only printed.

    The key is normalised once per row and that same value is used for both
    the membership test and the update, so keys containing non-ASCII bytes
    are matched correctly. Blank rows and rows too short to hold the needed
    column are skipped instead of ending the scan.

    Args:
        path1: CSV file whose column 1 is searched for keys.
        path2: CSV file whose column 0 supplies the keys.
        counts: Dict to fill. When omitted, the module-level ``mid`` dict is
            used, which is what existing callers rely on.

    Returns:
        The dict that was filled (``counts`` or the global ``mid``).
    """
    if counts is None:
        counts = mid  # module-level dict created by the script entry point

    # Pass 1: create the keys, all starting at 0.
    created = 0
    with open(path2, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)  # None for an empty file, no StopIteration
        if header is not None:
            print(header)
        for row in reader:
            if not row:
                continue
            key = _gbk_text(row[0])
            if key not in counts:
                counts[key] = 0
                created += 1
                print("正在建立第" + str(created) + "個鍵")
    print("數據處理完畢,鍵值徹底造成" + str(created) + "!")

    # Pass 2: count how often each key occurs.
    with open(path1, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is not None:
            print(header)
        for row in reader:
            if len(row) < 2:
                continue
            key = _gbk_text(row[1])
            if key in counts:
                counts[key] += 1
                print("這條微博被轉發" + str(counts[key]) + "次")
    print("數據處理完畢,轉發次數統計完畢")
    return counts
# Convert a dict into a list of keys and a list of values.
def transpond(dict):
    """Split a mapping into parallel key and value lists.

    The lists are stored in the module-level names ``list_key`` and
    ``list_value`` (the original contract) and are also returned, so callers
    do not have to go through the globals.

    Args:
        dict: Mapping to split. The name shadows the builtin but is kept so
            that existing keyword callers keep working.

    Returns:
        A ``(keys, values)`` tuple of two lists in the mapping's iteration
        order.
    """
    global list_key  # keys
    global list_value  # values
    list_key = list(dict)
    list_value = list(dict.values())
    return list_key, list_value
# Write two parallel lists to a CSV file as two columns.
def data_write_csv(file_name, list1, list2, encoding=None):
    """Write ``list1`` and ``list2`` side by side as a two-column CSV.

    Row *i* holds ``list1[i], list2[i]``. The lists are paired with ``zip``,
    so when they differ in length the extra items of the longer one are not
    written. An existing file at ``file_name`` is overwritten.

    Args:
        file_name: Path of the CSV file to write.
        list1: Values for the first column.
        list2: Values for the second column.
        encoding: Text encoding of the output file. ``None`` (the default)
            keeps the previous behaviour of using the platform default.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'w', newline='', encoding=encoding) as f:
        csv.writer(f).writerows(zip(list1, list2))
# Script entry point: count reposts per original post and save the result.
if __name__ == "__main__":
    path_data = 'D:/week1.csv'  # raw data
    path_data_original = 'D:/csv_data_original.csv'  # processed data holding only original posts
    path_save = 'D:/transpond_data.csv'  # where the result is written
    mid = {}  # repost count per original post; statistics() fills this global
    list_key = []  # keys; rebound by transpond()
    list_value = []  # values; rebound by transpond()
    statistics(path_data, path_data_original)
    transpond(mid)
    data_write_csv(path_save, list_key, list_value)
2. 與 1 相似的操作,具體有一些細節變更,代碼中有註釋。
import csv
import pandas as pd
def _gbk_text(field):
    """Return *field* re-decoded as GBK.

    The CSV files are opened as ISO-8859-1, so encoding the field back to
    ISO-8859-1 recovers the original bytes; those bytes are then decoded as
    GBK, dropping any byte sequence that is not valid GBK.
    """
    return field.encode("iso-8859-1").decode("gbk", "ignore")


# Sum the repost counts that belong to each key.
def statistics(path1, path2, counts=None):
    """Build a key table from one CSV and sum per-key amounts from another.

    Pass 1 reads ``path2`` and creates one zero-initialised entry per distinct
    value of column 0. Pass 2 reads ``path1`` and, for every row whose
    column 2 is a known key, adds ``int(column 1)`` to that entry. The first
    row of each file is treated as a header and only printed.

    The key is normalised once per row and that same value is used for both
    the membership test and the update, so keys containing non-ASCII bytes
    are matched correctly. Blank rows and rows too short to hold the needed
    columns are skipped instead of ending the scan.

    Args:
        path1: CSV file with the amount in column 1 and the key in column 2.
        path2: CSV file whose column 0 supplies the keys.
        counts: Dict to fill. When omitted, the module-level ``mid`` dict is
            used, which is what existing callers rely on.

    Returns:
        The dict that was filled (``counts`` or the global ``mid``).

    Raises:
        ValueError: If column 1 of a matching row is not an integer.
    """
    if counts is None:
        counts = mid  # module-level dict created by the script entry point

    # Pass 1: create the keys, all starting at 0.
    created = 0
    with open(path2, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)  # None for an empty file, no StopIteration
        if header is not None:
            print(header)
        for row in reader:
            if not row:
                continue
            key = _gbk_text(row[0])
            if key not in counts:
                counts[key] = 0
                created += 1
                print("正在建立第" + str(created) + "個鍵")
    print("數據處理完畢,鍵值徹底造成" + str(created) + "!")

    # Pass 2: accumulate the amount of every row that matches a key.
    with open(path1, 'r', encoding="iso-8859-1") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is not None:
            print(header)
        for row in reader:
            if len(row) < 3:
                continue
            key = _gbk_text(row[2])
            if key in counts:
                counts[key] += int(row[1])
                print("這個用戶的微博被轉發一共" + str(counts[key]) + "次")
    print("數據處理完畢,轉發次數統計完畢")
    return counts
# Convert a dict into a list of keys and a list of values.
def transpond(dict):
    """Split a mapping into parallel key and value lists.

    The lists are stored in the module-level names ``list_key`` and
    ``list_value`` (the original contract) and are also returned, so callers
    do not have to go through the globals.

    Args:
        dict: Mapping to split. The name shadows the builtin but is kept so
            that existing keyword callers keep working.

    Returns:
        A ``(keys, values)`` tuple of two lists in the mapping's iteration
        order.
    """
    global list_key  # keys
    global list_value  # values
    list_key = list(dict)
    list_value = list(dict.values())
    return list_key, list_value
# Write two parallel lists to a CSV file as two columns.
def data_write_csv(file_name, list1, list2, encoding=None):
    """Write ``list1`` and ``list2`` side by side as a two-column CSV.

    Row *i* holds ``list1[i], list2[i]``. The lists are paired with ``zip``,
    so when they differ in length the extra items of the longer one are not
    written. An existing file at ``file_name`` is overwritten.

    Args:
        file_name: Path of the CSV file to write.
        list1: Values for the first column.
        list2: Values for the second column.
        encoding: Text encoding of the output file. ``None`` (the default)
            keeps the previous behaviour of using the platform default.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'w', newline='', encoding=encoding) as f:
        csv.writer(f).writerows(zip(list1, list2))
# Script entry point: total the repost counts per key and save the result.
if __name__ == '__main__':
    path1 = 'D:/csv_data_original_num.csv'  # data the keys are built from
    path2 = 'D:/data_all.csv'  # data searched for those keys
    path_save = 'D:/user_transpond.csv'  # where the totals are written
    mid = {}  # filled by statistics() through the global name
    list_key = []  # rebound by transpond()
    list_value = []  # rebound by transpond()
    # Note the argument order: statistics() builds keys from its second path.
    statistics(path2, path1)
    transpond(mid)
    data_write_csv(path_save, list_key, list_value)
3. 將大數據的 csv 文件根據特定條件分成幾份小文件。
#coding = utf-8
import pandas as pd
import csv
def get_txt(path1, path2, path3, path4, path5, path6, path7, path8,
            chunk_size=700000):
    """Split column 6 of a CSV file across seven text files by row number.

    The first row of ``path1`` is treated as a header and only printed. Data
    rows are numbered from 1, and row ``num`` goes to output file number
    ``num // chunk_size`` (0-based), one value per line. With the default
    chunk size this matches the original boundaries: rows 1-699999 go to
    ``path2``, rows 700000-1399999 to ``path3``, and so on.

    Rows past the last boundary are written to the final file (``path8``)
    instead of being dropped. Blank rows and rows with fewer than seven
    columns are skipped and not numbered. All files are closed on exit, also
    when an error occurs.

    Args:
        path1: Input CSV file (UTF-8).
        path2, path3, path4, path5, path6, path7, path8: The seven output
            text files (UTF-8), overwritten if they exist.
        chunk_size: Number of row numbers per output file.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    from contextlib import ExitStack

    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    text_column = 6
    out_paths = (path2, path3, path4, path5, path6, path7, path8)
    num = 0
    with ExitStack() as stack:
        f = stack.enter_context(open(path1, 'r', encoding='utf-8'))
        outputs = [
            stack.enter_context(open(out_path, "w", encoding='utf-8'))
            for out_path in out_paths
        ]
        last = len(outputs) - 1

        reader = csv.reader(f)
        header = next(reader, None)  # None for an empty file, no StopIteration
        if header is not None:
            print(header)
        for row in reader:
            if len(row) <= text_column:
                continue
            num += 1
            print(num)
            print(row[text_column])
            # Clamp so overflow rows land in the last file rather than vanish.
            target = outputs[min(num // chunk_size, last)]
            target.write(row[text_column] + '\n')
    print("數據處理完畢,轉發次數統計完畢")
# Script entry point: split D:/week1.csv into seven text files.
if __name__ == '__main__':
    path1 = 'D:/week1.csv'  # input CSV
    # Output text files, in the order get_txt() fills them.
    path2 = 'D:/text1.txt'
    path3 = 'D:/text2.txt'
    path4 = 'D:/text3.txt'
    path5 = 'D:/text4.txt'
    path6 = 'D:/text5.txt'
    path7 = 'D:/text6.txt'
    path8 = 'D:/text7.txt'
    get_txt(path1, path2, path3, path4, path5, path6, path7, path8)