對C++或者Java熟悉的同學,寫Python代碼時一般會沿用C++、Java的方式.有些情況下,用Python的方法實現一些功能會更方便.
# coding:utf-8
import os

# Directory to scan; only its direct children are inspected (no recursion).
filepath = r"D:\test"
# Keep the entries that are regular files, dropping sub-directories.
files = [f for f in os.listdir(filepath)
         if os.path.isfile(os.path.join(filepath, f))]
print(files)
列出文件夾filepath下的全部文件名.一行代碼解決.同理可以列出全部文件夾.注意,沒有列出子目錄的內容.
# coding:utf-8
# Multiplying a list by an integer repeats its elements.
print(3 * [2])  # [2, 2, 2]
結果是一個數組.這個技巧在分類、聚類算法中初始化類編號時最常用.
常用來構建一個亂序的numpy類型的數組.
# coding:utf-8
import numpy as np

# A RandomState built from a fixed seed always yields the same permutation.
random_state = np.random.RandomState(0)
indices = np.arange(100)
random_state.shuffle(indices)  # shuffles the array in place
print(indices)
np.random.RandomState的參數(種子)相同時,構造的數組一定相同;參數不同時,得到的數組一般不同.
Python 3.3以前的CPython內部編碼只可能是UCS-2,UCS-4中的某一種.sys.maxunicode爲65535時表示該版本內部編碼是UCS-2,sys.maxunicode爲1114111時,表示該版本內部編碼是UCS-4.自Python 3.3(PEP 393)起,sys.maxunicode恆爲1114111.
import sys

# 65535 indicates a narrow (UCS-2) build, 1114111 a wide (UCS-4) build.
print(sys.maxunicode)
# zip(*pairs) "unzips" a list of pairs into two tuples:
# one holding the first items, one holding the second items.
labels = [(1, 2), (3, 4), (5, 6)]
labels, categories = zip(*labels)
print(labels)
print(categories)
可以把一個由二元組組成的數組拆成2個數組.也可以把2個ndarray按行配對,合成一個tuple數組
import numpy as np

a = np.array([[1, 2], [3, 4], [5, 6], [2, 3], [6, 9]])
b = np.array([[1], [2], [3], [4], [5]])
# zip pairs the i-th row of a with the i-th row of b.
for c in zip(a, b):
    print(c)
用numpy.in1d()(新版NumPy建議改用numpy.isin())可以構建一個bool類型的數組.通過該數組,可以把數組中的一些元素去掉.這在分類算法中去掉一些數據集時非常有用.
# coding:utf-8
import numpy as np

a = [1, 2, 3, 4, 5]
b = [1, 4]
# Boolean mask: True where the element of a also appears in b.
# np.isin supersedes np.in1d, which is deprecated as of NumPy 2.0.
mask = np.isin(a, b)
names = np.array(["aaa", "bbb", "ccc", "ddd", "fff"])
names = names[mask]  # keep only the positions selected by the mask
print(names)  # ['aaa' 'ddd']
# coding:utf-8
from urllib.request import urlopen

URL = "http://download.labs.sogou.com/dl/sogoulabdown/categories_2012.txt"
# Both the HTTP response and the output file are closed on exit,
# even when reading or writing fails.
with urlopen(URL) as opener, open("test.txt", 'wb') as f:
    f.write(opener.read())
在處理大文件時常常會遇到這個問題.求一大批文檔的tfidf時會產生一個很大但很稀疏的矩陣,而numpy的各種運算的參數又是numpy數組.不能把稀疏矩陣直接轉化成numpy數組(內存裝不下),解決方法是在預處理的時候把稀疏矩陣存成很多小文件,比如50行存成一個小文件,在訓練的時候每次讀取一個小文件.這就實現了小內存處理大文件.
import pickle
import numpy as np

fpath = r"D:\seri.dat"
a = {}
a['aaa'] = 1
a['bbb'] = 2
b = np.array([1, 2, 3])
with open(fpath, 'wb') as f:
    # Serialize the array b to disk (the dict a above is not written).
    pickle.dump(b, f)
with open(fpath, 'rb') as f:
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    obj2 = pickle.load(f)
print(obj2)  # the array read back into memory
建立矩陣函數
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

corpus = np.array(["aaa bbb ccc", "aaa bbb ddd"])
cv1 = CountVectorizer()
cv1output = cv1.fit_transform(corpus)  # sparse term-count matrix
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on old versions.
if hasattr(cv1, "get_feature_names_out"):
    print(cv1.get_feature_names_out())
else:
    print(cv1.get_feature_names())
tfidfTrans1 = TfidfTransformer()
print(tfidfTrans1.fit_transform(cv1output))  # tf-idf matrix of the corpus
tfidfTrans1.fit_transform(cv1output)的結果就是最終的tfidf矩陣.這時候有一個測試集("aaa vvv ccc", "ccc ccc rrr"),注意vvv,rrr都不在訓練集中,要忽略.所以要以訓練集的單詞爲基準,建立測試矩陣.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# --- training set ---
corpus = np.array(["aaa bbb ccc", "aaa bbb ddd"])
cv1 = CountVectorizer()
cv1output = cv1.fit_transform(corpus)
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on old versions.
if hasattr(cv1, "get_feature_names_out"):
    print(cv1.get_feature_names_out())
else:
    print(cv1.get_feature_names())
tfidfTrans1 = TfidfTransformer()
print(tfidfTrans1.fit_transform(cv1output))

# --- test set ---
corpus1 = np.array(["aaa vvv ccc", "ccc ccc rrr"])
# Reuse the training vocabulary so unseen words (vvv, rrr) are ignored.
cv2 = CountVectorizer(vocabulary=cv1.vocabulary_)
cv2output = cv2.transform(corpus1)
# Weight the test counts with the IDF learned on the training set;
# fitting a fresh TfidfTransformer here would learn IDF from the test data.
print(tfidfTrans1.transform(cv2output))
# coding:utf-8
import numpy as np


def dense_to_one_hot(input_data, class_num):
    """Convert integer class labels into a one-hot matrix.

    Args:
        input_data: array-like of integer labels, each in ``[0, class_num)``.
            Its first dimension gives the number of samples.
        class_num: number of classes, i.e. the number of columns produced.

    Returns:
        A float ndarray of shape ``(data_num, class_num)`` whose row ``i``
        holds a single 1 at column ``input_data[i]``.

    Raises:
        ValueError: if a label lies outside ``[0, class_num)``. Without this
            check such a label would set a cell belonging to another row.
    """
    labels = np.asarray(input_data)
    data_num = labels.shape[0]
    flat_labels = labels.ravel()
    labels_one_hot = np.zeros((data_num, class_num))
    if flat_labels.size == 0:
        return labels_one_hot
    if flat_labels.min() < 0 or flat_labels.max() >= class_num:
        raise ValueError(
            "labels must be in [0, %d), got range [%s, %s]"
            % (class_num, flat_labels.min(), flat_labels.max()))
    # Flat position of column 0 of every row, e.g. [0, 10, 20, ..., 120]
    # for 13 samples and 10 classes.
    index_offset = np.arange(data_num) * class_num
    # Row start + label = flat position of the cell to switch on, e.g.
    # [0, 10, ..., 120] + [0, 1, ..., 9, 0, 1, 3] = [0, 11, ..., 99, 100, 111, 123].
    labels_one_hot.flat[index_offset + flat_labels] = 1
    return labels_one_hot


input_data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 3])
class_num = 10
print(dense_to_one_hot(input_data, class_num))
tuple能夠表示不變集合,例如,一個點的二維座標就能夠表示成:
p = (1, 2)  # a plain tuple used as a 2-D coordinate
可是,看到(1, 2),很難看出這個tuple是用來表示一個座標的.定義一個class又小題大作了,這時,namedtuple就派上了用場:
from collections import namedtuple

# A lightweight immutable record type whose fields are named x and y.
Point = namedtuple('Point', ['x', 'y'])
p = Point(1, 2)
print(p.x)
print(p.y)
參考資料:廖雪峯的官方網站
*args表示傳遞不定長的參數.
def fun_var_args(farg, *args):
    """Print the first argument, then every extra positional argument.

    ``args`` collects the extra positional arguments into a tuple.
    """
    print("arg:", farg)
    for value in args:
        print("another arg:", value)


fun_var_args(1, "two", 3)
**kwargs也表示傳遞不定長的參數.和*args的區別是**kwargs傳的是key, value的結構.
def fun_var_kwargs(farg, **kwargs):
    """Print the first argument, then every extra keyword argument.

    ``kwargs`` collects the extra keyword arguments into a dict that maps
    each argument name to its value.
    """
    print("arg:", farg)
    for key in kwargs:
        print("another keyword arg: %s: %s" % (key, kwargs[key]))


# myarg2 and myarg3 end up as keys of kwargs.
fun_var_kwargs(farg=1, myarg2="two", myarg3=3)
原生的方法
# coding:utf-8
import sys

if __name__ == "__main__":
    # Two command-line arguments are required; fail with a usage message
    # instead of an IndexError when they are missing.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <arg1> <arg2>" % sys.argv[0])
    print(sys.argv[1])
    print(sys.argv[2])
用tensorflow的方法
# coding:utf-8
import tensorflow as tf

# NOTE: tf.app.flags is the TensorFlow 1.x location of the flags module;
# TensorFlow 2.x exposes it as tf.compat.v1.flags instead.
flags = tf.app.flags
# Each definition is (flag name, default value, help text).
flags.DEFINE_string("zipfilepath", "a", "zip file path")
flags.DEFINE_string("unzipfolder", "b", "unzip folder")
FLAGS = flags.FLAGS
print(FLAGS.zipfilepath)
print(FLAGS.unzipfolder)
import zipfile

# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(r"D:\test.zip") as zip_ref:
    zip_ref.extractall(r"D:\unfolder")
python setup.py sdist --formats=gztar
Z[:] = [0 if x > 0.5 else 1 for x in Z]  # in place: values > 0.5 become 0, all others become 1
a = dict()
a[1] = 2
a[2] = 3
a[3] = 4
# Swap keys and values; duplicated values would collapse into a single key.
d = {v: k for k, v in a.items()}
print(d)
import itertools

a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
# chain.from_iterable flattens exactly one level of nesting.
print(list(itertools.chain.from_iterable(a)))
import numpy as np

matrix = np.random.random([1024, 64])  # 64-dimensional embeddings
ids = np.array([0, 5, 17, 33])
# Indexing with an integer array selects the listed rows.
print(matrix[ids].shape)  # prints (4, 64)
能夠對比tensorflow的tf.nn.embedding_lookup功能
import platform

# python_version() returns a string such as "3.11.4".
if platform.python_version().startswith("3"):
    print("a")
import numpy as np

a = np.random.rand(10)  # 10 floats drawn from [0, 1)
print(a)
b = np.around(a)  # round to the nearest integer, still float dtype
print(b.astype(int))
data = dict()
data[1] = 2
data[13] = 1
data[5] = 9
# Default ordering: by key (the first item of each pair).
count_pairs = sorted(data.items())
print(count_pairs)
# Order by value first, then by key to break ties.
count_pairs = sorted(data.items(), key=lambda x: (x[1], x[0]))
print(count_pairs)
import numpy as np


def rand_arr(a, b, *args):
    """Return a reproducible array of floats drawn uniformly from [a, b).

    Args:
        a: lower bound of the interval (inclusive).
        b: upper bound of the interval (exclusive).
        *args: dimensions of the returned array, e.g. ``2, 3``.

    Returns:
        An ndarray of shape ``args``. The generator is seeded with 0 on every
        call, so the same shape always yields the same values.
    """
    # A private RandomState(0) produces the same numbers as np.random.seed(0)
    # followed by np.random.rand, without resetting the global generator.
    return np.random.RandomState(0).rand(*args) * (b - a) + a


a = rand_arr(0, 1, 2, 3)
print(a)
雙星(**)表示乘方(冪運算),對數組是逐元素計算
import numpy as np

a = np.array([1, 2, 3])
# Element-wise power: each element raised to itself.
print(a ** a)
輸出結果爲[ 1 4 27]
import numpy as np

# Draw 50000 samples from {0, 1}, each value with probability 0.5.
a = np.random.choice(2, 50000, p=[0.5, 0.5])
print(len(a))
print(a[0: 10])
import numpy as np

x = np.arange(10)
print(x)
# Shift right by 3 positions; elements pushed off the end wrap to the front.
print(np.roll(x, 3))
import numpy as np

# Swap axes 0 and 1, keep axis 2.
arr1 = np.arange(12).reshape(2, 2, 3)
print("---------------------------------轉換前---------------------------------")
print(arr1)
print("---------------------------------轉換後---------------------------------")
print(arr1.transpose((1, 0, 2)))

# Keep axis 0, swap axes 1 and 2.
arr1 = np.arange(12).reshape(2, 2, 3)
print("---------------------------------轉換前---------------------------------")
print(arr1)
print("---------------------------------轉換後---------------------------------")
print(arr1.transpose((0, 2, 1)))
用"數組的數組"來理解多維數組.形狀爲(2, 2, 3)的arr1是一個有2個元素的數組,每個元素又是長度爲2的數組,而長度爲2的數組的每個元素又是一個長度爲3的數組.arr1.transpose((1, 0, 2))的意思是第3維不變,交換第1維和第2維.可以這樣認爲,arr1原始的結構如下:
\[ \left[ \begin{array}{cc} A & B \\ C & D \\ \end{array} \right] \]
其中A=[0, 1, 2],B=[3, 4, 5],C=[6, 7, 8],D=[9, 10, 11],如今要轉置第1維和第2維,因此轉後爲
\[ \left[ \begin{array}{cc} A & C \\ B & D \\ \end{array} \right] \]
A,B,C,D的內容不變,這就解釋了arr1.transpose((1, 0, 2))的值.用類似的思路可以解釋arr1.transpose((0, 2, 1))的值.對於arr1.transpose((0, 2, 1))可以認爲是第1維不變,第2,3維轉置.第1維的每個元素都是一個2*3的矩陣,轉置後變成3*2,這就解釋了arr1.transpose((0, 2, 1))的輸出.
一般狀況下從dict中按key取一個值,若是key不存在會報錯.能夠用defaultdict定義dict,key不存在時不會報錯.
from collections import defaultdict

# Missing keys are created on first access with the value int(), i.e. 0.
a = defaultdict(int)
a["3"] = 1
print(a["3"])
print(a["45"])
# {0:.2f} renders argument 0 with two decimals; {1} renders argument 1 as is.
print('{0:.2f} finished. Epoch {1}'.format(1.1234, 2.3354))
g = "{0:.2f}, {1}".format(1.1234, "aa")
能夠像使用屬性那樣用函數.
# coding:utf-8
class Person(object):
    """A person whose full name is exposed as a read-only property."""

    def __init__(self, first_name, last_name):
        """Constructor"""
        self.first_name = first_name
        self.last_name = last_name

    @property
    def full_name(self):
        """Return the first and last name joined by a space."""
        return "%s %s" % (self.first_name, self.last_name)


person = Person("zhang", "san")
# Thanks to @property this is attribute access, not a call; without the
# decorator it would print the bound method instead of the name.
print(person.full_name)
class Person(object):
    """A person object that can be called like a function."""

    def __init__(self, name, gender):
        self.name = name
        self.gender = gender

    def __call__(self, friend):
        """Print this person's name and the given friend's name."""
        print('My name is %s...' % self.name)
        print('My friend is %s...' % friend)


p = Person('Bob', 'male')
p('Tim')  # calling the instance invokes __call__
# coding:utf-8
import os

command = 'ps a'
# os.popen runs the command in a shell and exposes its standard output
# as a file-like object; the with block closes it afterwards.
with os.popen(command) as p:
    info = p.read()
print(info)
# coding:utf-8
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function

import hashlib

s1 = "中華人民共和國"
s2 = "美國"
# md5 works on bytes, so each string is encoded first.
# s1 is hashed twice: the same input prints the same digest.
print(hashlib.md5(s1.encode("utf-8")).hexdigest())
print(hashlib.md5(s1.encode("utf-8")).hexdigest())
print(hashlib.md5(s2.encode("utf-8")).hexdigest())
# coding:utf-8
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.weichat
# Insert one document into the "docs" collection of the "weichat" database.
db.docs.insert_one(
    {"class_type": "canvas",
     "content": "春江潮水連海平",
     })
a = "aaa bbb ccc ddd eee aaa bbb aaa aaa"
# str.count returns the number of non-overlapping occurrences.
print(a.count("aaa"))
# coding:utf-8
class MyClass:
    """Small class used to demonstrate hasattr and getattr."""

    def __init__(self):
        self.name = "xiaohua"

    def process(self):
        """Return the stored name."""
        return self.name


t = MyClass()
print(hasattr(t, "name"))  # does the attribute "name" exist?
print(hasattr(t, "process"))  # does the attribute "process" exist?
print(getattr(t, "name"))  # fetch the value of "name"
print(getattr(t, "process"))  # fetch the bound method "process" (prints its repr)
print(getattr(t, "process")())  # fetch "process" and call it
print(getattr(t, "age", "18"))  # "age" is missing, so the default "18" is returned
# coding:utf-8
import platform

# Prints a single string identifying the underlying platform.
print(platform.platform())
# coding:utf-8
from __future__ import print_function

import argparse


def build_parser(argv=None):
    """Parse the command line and return the resulting namespace.

    Args:
        argv: optional list of argument strings. When omitted, argparse
            reads the arguments from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with the required ``run_type`` attribute.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_type', type=str, required=True)
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = build_parser()
    if args.run_type == "train":
        print("train")
    else:
        print("test")
# coding:utf-8
import re

import jieba

# A word made up only of CJK characters and ASCII letters.
WORD_FORMAT = r"[\u4e00-\u9fa5A-Za-z]+$"
content = "咱們都有一個家,名字叫中國08"
seg_list = jieba.cut(content)
pattern = re.compile(WORD_FORMAT)
# fullmatch requires the whole token to fit the pattern. search() only
# anchored the end, so a token such as "3D" slipped through.
doc = " ".join(word for word in seg_list if pattern.fullmatch(word))
print(doc)
m = ["a", "b", "c", "d", "e", "f"]
# A step of -1 walks the list backwards, giving a reversed copy.
print(m[::-1])
m = ["a", "b", "c", "d", "e", "f"]
# A negative start counts from the end: the last three elements.
print(m[-3:])