安裝 spaCy
pip install spacy
導入工具包和英文模型
#python -m spacy download en
文本處理
import spacy

# Load the small English pipeline by its full package name. The bare 'en'
# shortcut only resolves on spaCy 2.x installs that created a shortcut link,
# and spaCy 3 removed shortcut links altogether. Install the model with:
#     python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# Tokenization: iterating over the parsed document yields one token at a time.
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
for token in doc:
    print(token)

# Sentence segmentation: doc.sents yields one span per detected sentence.
for sent in doc.sents:
    print(sent)
Weather is good , very windy and sunny . We have no classes in the afternoon . Weather is good, very windy and sunny. We have no classes in the afternoon.
詞性
# Part-of-speech tagging: print every token of `doc` joined to its
# coarse-grained POS tag as "<token>-<TAG>".
for tok in doc:
    print(f'{tok}-{tok.pos_}')
Weather-NOUN is-AUX good-ADJ ,-PUNCT very-ADV windy-ADJ and-CCONJ sunny-ADJ .-PUNCT We-PRON have-AUX no-DET classes-NOUN in-ADP the-DET afternoon-NOUN .-PUNCT
命名實體識別
from spacy import displacy

# Named-entity recognition: parse a sample sentence and list every entity
# span together with its label as "<entity>-<LABEL>".
doc = nlp('I went to beijing where I met my old friend Jack from uni.')
for entity in doc.ents:
    print(f'{entity}-{entity.label_}')

# Visualise the entities with displaCy; jupyter=True requests notebook rendering.
displacy.render(doc, style='ent', jupyter=True)
beijing-GPE Jack-PERSON
<span class="tex2jax_ignore"><div class="entities" style="line-height: 2.5; direction: ltr">I went to <mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"> beijing <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span> </mark> where I met my old friend <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"> Jack <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">PERSON</span> </mark> from uni.</div></span>
找出文中全部的人名
from collections import Counter, defaultdict


def getFileContent(path, encoding='utf-8'):
    """Return the whole content of the text file at ``path`` as one string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        The decoded file content.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()


doc = nlp(getFileContent('./data/pride_and_prejudice.txt'))

# Materialise the sentence spans so they can be counted.
sents = list(doc.sents)
print(len(sents))


def find_person(doc, top_n=10):
    """Return the most frequently mentioned PERSON entities of ``doc``.

    Args:
        doc: Parsed document whose ``ents`` are inspected.
        top_n: How many of the most common names to return (default 10).

    Returns:
        A list of ``(lemma, count)`` pairs, most frequent first.
    """
    # Entities are tallied by lemma; only spans labelled PERSON are counted.
    names = Counter(ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON')
    return names.most_common(top_n)


print(find_person(doc))
7153 [('Elizabeth', 600), ('Darcy', 355), ('Jane', 277), ('Bingley', 260), ('Bennet', 258), ('Collins', 166), ('Wickham', 108), ('Lizzy', 94), ('Gardiner', 90), ('Lady Catherine', 76)]
恐怖襲擊分析
def read_lines(path, encoding='utf-8'):
    """Return the lines of the text file at ``path`` as a list of strings.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        One string per line, as produced by ``readlines()`` (line endings kept).
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.readlines()


text = read_lines('./data/rand-terrorism-dataset.txt')

# Parse every line as its own document. nlp.pipe streams the texts through the
# pipeline in batches and yields the parsed documents in input order, which
# avoids the per-call overhead of invoking nlp() once per line.
nlp_list = list(nlp.pipe(text))
# Group names of interest. Entries are lower-case, and hyphenated names are
# written with spaces around the hyphen.
common_terrorist_groups = [
    'taliban',
    'al - qaeda',
    'hamas',
    'fatah',
    'plo',
    'bilad al - rafidayn',
]

# Place names of interest, also lower-case.
common_locations = [
    'iraq',
    'baghdad',
    'kirkuk',
    'mosul',
    'afghanistan',
    'kabul',
    'basra',
    'palestine',
    'gaza',
    'israel',
    'istanbul',
    'beirut',
    'pakistan',
]
# For every parsed article, count how often each known group is mentioned
# together with each known location: location_entity_dict[group][location].
location_entity_dict = defaultdict(Counter)
for article in nlp_list:
    groups_in_article = []
    places_in_article = []
    for ent in article.ents:
        if ent.label_ in ('PERSON', 'ORG'):  # a person or an organisation
            # Matching is case-insensitive, but the lemma is stored with its
            # original casing, so case variants end up under separate keys.
            if ent.lemma_.lower() in common_terrorist_groups:
                groups_in_article.append(ent.lemma_)
        elif ent.label_ == 'GPE':
            if ent.lemma_.lower() in common_locations:
                places_in_article.append(ent.lemma_)

    # A group only gets an entry once it co-occurs with at least one location.
    for group_name in groups_in_article:
        for place_name in places_in_article:
            location_entity_dict[group_name][place_name] += 1
location_entity_dict
defaultdict(collections.Counter, {'PLO': Counter({'Beirut': 9, 'ISRAEL': 17, 'Israel': 21, 'Iraq': 8, 'Palestine': 1}), 'Fatah': Counter({'Israel': 18, 'Beirut': 1, 'Iraq': 1, 'ISRAEL': 4, 'Gaza': 11}), 'Hamas': Counter({'ISRAEL': 7, 'Israel': 19, 'Beirut': 1, 'Gaza': 70}), 'Taliban': Counter({'AFGHANISTAN': 3, 'Kabul': 45, 'Pakistan': 17, 'Afghanistan': 263}), 'HAMAS': Counter({'ISRAEL': 1}), 'Al - Qaeda': Counter({'Kabul': 1, 'Iraq': 4, 'Israel': 1, 'Baghdad': 5, 'Pakistan': 1, 'Mosul': 16, 'Kirkuk': 2}), 'al - Qaeda': Counter({'Iraq': 46, 'Afghanistan': 6, 'Kabul': 2, 'Istanbul': 3, 'Baghdad': 14, 'Palestine': 3, 'Mosul': 1, 'Kirkuk': 3, 'Pakistan': 5}), 'Bilad al - Rafidayn': Counter({'Iraq': 21, 'Baghdad': 32, 'Basra': 4, 'Mosul': 4, 'Palestine': 6}), 'taliban': Counter({'Kabul': 1})})
import pandas as pd

# Build a table with one column per group and one row per location.
# Group/location pairs that never co-occurred come out as NaN, which an
# integer column cannot hold, so no integer dtype is requested here; the gaps
# are zero-filled first and the counts are cast to int afterwards.
df = pd.DataFrame.from_dict(dict(location_entity_dict))
df = df.fillna(value=0).astype(int)
df
<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>PLO</th> <th>Fatah</th> <th>Hamas</th> <th>Taliban</th> <th>HAMAS</th> <th>Al - Qaeda</th> <th>al - Qaeda</th> <th>Bilad al - Rafidayn</th> <th>taliban</th> </tr> </thead> <tbody> <tr> <th>Beirut</th> <td>9</td> <td>1</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>ISRAEL</th> <td>17</td> <td>4</td> <td>7</td> <td>0</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>Israel</th> <td>21</td> <td>18</td> <td>19</td> <td>0</td> <td>0</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>Iraq</th> <td>8</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>4</td> <td>46</td> <td>21</td> <td>0</td> </tr> <tr> <th>Palestine</th> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>3</td> <td>6</td> <td>0</td> </tr> <tr> <th>Gaza</th> <td>0</td> <td>11</td> <td>70</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>AFGHANISTAN</th> <td>0</td> <td>0</td> <td>0</td> <td>3</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>Kabul</th> <td>0</td> <td>0</td> <td>0</td> <td>45</td> <td>0</td> <td>1</td> <td>2</td> <td>0</td> <td>1</td> </tr> <tr> <th>Pakistan</th> <td>0</td> <td>0</td> <td>0</td> <td>17</td> <td>0</td> <td>1</td> <td>5</td> <td>0</td> <td>0</td> </tr> <tr> <th>Afghanistan</th> <td>0</td> <td>0</td> <td>0</td> <td>263</td> <td>0</td> <td>0</td> <td>6</td> <td>0</td> <td>0</td> </tr> <tr> <th>Baghdad</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>5</td> <td>14</td> <td>32</td> <td>0</td> </tr> <tr> <th>Mosul</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>16</td> <td>1</td> <td>4</td> <td>0</td> </tr> <tr> <th>Kirkuk</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>2</td> <td>3</td> <td>0</td> <td>0</td> </tr> <tr> <th>Istanbul</th> <td>0</td> <td>0</td> <td>0</td> 
<td>0</td> <td>0</td> <td>0</td> <td>3</td> <td>0</td> <td>0</td> </tr> <tr> <th>Basra</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>4</td> <td>0</td> </tr> </tbody> </table> </div>
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the group/location count table `df` as an annotated heatmap.
plt.figure(figsize=(12, 10))
# annot=True with fmt='d' writes each integer count inside its cell.
hmap = sns.heatmap(df, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
plt.title('terror')
# Rotate the x-axis tick labels by 30 degrees.
plt.xticks(rotation=30)
plt.show()