spaCy 簡單入門

安裝 spaCy

pip install spacy

導入工具包和英文模型

#python -m spacy download en    （spaCy v3 起已取消 en 簡寫，請改用：python -m spacy download en_core_web_sm）

文本處理

import spacy

# Load the small English pipeline by its full package name; the bare 'en'
# shortcut is no longer supported from spaCy v3 onwards.
nlp = spacy.load('en_core_web_sm')

# Tokenization: iterating over a Doc yields its tokens one by one.
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
for token in doc:
    print(token)

# Sentence segmentation: doc.sents yields one span per sentence.
for sent in doc.sents:
    print(sent)

Weather
is
good
,
very
windy
and
sunny
.
We
have
no
classes
in
the
afternoon
.
Weather is good, very windy and sunny.
We have no classes in the afternoon.

詞性

# Print every token of the parsed text next to its part-of-speech tag.
for tok in doc:
    print(f'{tok}-{tok.pos_}')

Weather-NOUN
is-AUX
good-ADJ
,-PUNCT
very-ADV
windy-ADJ
and-CCONJ
sunny-ADJ
.-PUNCT
We-PRON
have-AUX
no-DET
classes-NOUN
in-ADP
the-DET
afternoon-NOUN
.-PUNCT

命名實體識別

from spacy import displacy

# Parse a sentence that mentions a place and a person.
doc = nlp('I went to beijing where I met my old friend Jack from uni.')

# Print each recognised entity together with its label.
for entity in doc.ents:
    print(f'{entity}-{entity.label_}')

# Render the entity highlighting inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

beijing-GPE
Jack-PERSON

<span class="tex2jax_ignore"><div class="entities" style="line-height: 2.5; direction: ltr">I went to <mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"> beijing <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span> </mark> where I met my old friend <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"> Jack <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">PERSON</span> </mark> from uni.</div></span>

找出文中全部的人名

def getFileContent(path, encoding='utf-8'):
    """Return the whole content of a text file as one string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        The decoded content of the file.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

# Read the novel, run it through the pipeline, and count its sentences.
novel_text = getFileContent('./data/pride_and_prejudice.txt')
doc = nlp(novel_text)
sents = list(doc.sents)
print(len(sents))
from collections import Counter,defaultdict

def find_person(doc, top_n=10):
    """Return the most frequently mentioned PERSON entities of ``doc``.

    Args:
        doc: Object whose ``ents`` attribute yields entities exposing
            ``label_`` and ``lemma_``.
        top_n: Number of names to return; ``None`` returns all of them.
            Defaults to 10.

    Returns:
        A list of ``(lemma, count)`` tuples ordered from most to least
        frequent.
    """
    counts = Counter(
        ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON'
    )
    return counts.most_common(top_n)
# Show the most frequent person names found in the parsed novel.
print(find_person(doc))

7153
[('Elizabeth', 600), ('Darcy', 355), ('Jane', 277), ('Bingley', 260), ('Bennet', 258), ('Collins', 166), ('Wickham', 108), ('Lizzy', 94), ('Gardiner', 90), ('Lady Catherine', 76)]

恐怖襲擊分析

def read_lines(path, encoding='utf-8'):
    """Return the lines of a text file as a list of strings.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        One string per line; each keeps its trailing newline, if any.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.readlines()

# Each line of the dataset is treated as one article.
text = read_lines('./data/rand-terrorism-dataset.txt')

# nlp.pipe processes the texts as a stream instead of invoking the whole
# pipeline once per line; the result is still one Doc per input line.
nlp_list = list(nlp.pipe(text))

# Group names to look for, written in lower case because each entity lemma is
# lower-cased before it is compared against this list. The spaced hyphen in
# 'al - qaeda' mirrors the lemma text of such entities (the recorded results
# contain 'al - Qaeda').
common_terrorist_groups = [
    'taliban', 
    'al - qaeda', 
    'hamas',  
    'fatah', 
    'plo', 
    'bilad al - rafidayn'
]

# Place names to look for, likewise in lower case for the same comparison.
common_locations = [
    'iraq',
    'baghdad', 
    'kirkuk', 
    'mosul', 
    'afghanistan', 
    'kabul',
    'basra', 
    'palestine', 
    'gaza', 
    'israel', 
    'istanbul', 
    'beirut', 
    'pakistan'
]

# Co-occurrence counts, indexed as location_entity_dict[group][location].
location_entity_dict = defaultdict(Counter)

for article in nlp_list:
    group_hits = []
    place_hits = []

    # Collect the lemmas of the entities whose lower-cased form is on one of
    # the two watch lists.
    for ent in article.ents:
        if ent.label_ in ('PERSON', 'ORG'):  # a person or an organisation
            if ent.lemma_.lower() in common_terrorist_groups:
                group_hits.append(ent.lemma_)
        elif ent.label_ == 'GPE':
            if ent.lemma_.lower() in common_locations:
                place_hits.append(ent.lemma_)

    # Pair every group mention with every location mention of the article.
    # Skipping articles without locations keeps empty counters out of the
    # dict.
    # NOTE(review): keys keep the lemma's own casing, so variants such as
    # 'ISRAEL' and 'Israel' are tallied separately.
    if place_hits:
        for group in group_hits:
            location_entity_dict[group].update(place_hits)

location_entity_dict

defaultdict(collections.Counter,
            {'PLO': Counter({'Beirut': 9,
                      'ISRAEL': 17,
                      'Israel': 21,
                      'Iraq': 8,
                      'Palestine': 1}),
             'Fatah': Counter({'Israel': 18,
                      'Beirut': 1,
                      'Iraq': 1,
                      'ISRAEL': 4,
                      'Gaza': 11}),
             'Hamas': Counter({'ISRAEL': 7,
                      'Israel': 19,
                      'Beirut': 1,
                      'Gaza': 70}),
             'Taliban': Counter({'AFGHANISTAN': 3,
                      'Kabul': 45,
                      'Pakistan': 17,
                      'Afghanistan': 263}),
             'HAMAS': Counter({'ISRAEL': 1}),
             'Al - Qaeda': Counter({'Kabul': 1,
                      'Iraq': 4,
                      'Israel': 1,
                      'Baghdad': 5,
                      'Pakistan': 1,
                      'Mosul': 16,
                      'Kirkuk': 2}),
             'al - Qaeda': Counter({'Iraq': 46,
                      'Afghanistan': 6,
                      'Kabul': 2,
                      'Istanbul': 3,
                      'Baghdad': 14,
                      'Palestine': 3,
                      'Mosul': 1,
                      'Kirkuk': 3,
                      'Pakistan': 5}),
             'Bilad al - Rafidayn': Counter({'Iraq': 21,
                      'Baghdad': 32,
                      'Basra': 4,
                      'Mosul': 4,
                      'Palestine': 6}),
             'taliban': Counter({'Kabul': 1})})

import pandas as pd

# Columns are the groups and rows are the locations; pairs that never
# co-occurred come out as NaN. No integer dtype is requested at this point
# because recent pandas versions refuse to build integer columns from data
# that contains NaN.
df = pd.DataFrame.from_dict(dict(location_entity_dict))

# Fill the gaps with 0 first, then convert the counts to integers.
df = df.fillna(value=0).astype(int)

df

<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>PLO</th> <th>Fatah</th> <th>Hamas</th> <th>Taliban</th> <th>HAMAS</th> <th>Al - Qaeda</th> <th>al - Qaeda</th> <th>Bilad al - Rafidayn</th> <th>taliban</th> </tr> </thead> <tbody> <tr> <th>Beirut</th> <td>9</td> <td>1</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>ISRAEL</th> <td>17</td> <td>4</td> <td>7</td> <td>0</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>Israel</th> <td>21</td> <td>18</td> <td>19</td> <td>0</td> <td>0</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>Iraq</th> <td>8</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>4</td> <td>46</td> <td>21</td> <td>0</td> </tr> <tr> <th>Palestine</th> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>3</td> <td>6</td> <td>0</td> </tr> <tr> <th>Gaza</th> <td>0</td> <td>11</td> <td>70</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>AFGHANISTAN</th> <td>0</td> <td>0</td> <td>0</td> <td>3</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> </tr> <tr> <th>Kabul</th> <td>0</td> <td>0</td> <td>0</td> <td>45</td> <td>0</td> <td>1</td> <td>2</td> <td>0</td> <td>1</td> </tr> <tr> <th>Pakistan</th> <td>0</td> <td>0</td> <td>0</td> <td>17</td> <td>0</td> <td>1</td> <td>5</td> <td>0</td> <td>0</td> </tr> <tr> <th>Afghanistan</th> <td>0</td> <td>0</td> <td>0</td> <td>263</td> <td>0</td> <td>0</td> <td>6</td> <td>0</td> <td>0</td> </tr> <tr> <th>Baghdad</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>5</td> <td>14</td> <td>32</td> <td>0</td> </tr> <tr> <th>Mosul</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>16</td> <td>1</td> <td>4</td> <td>0</td> </tr> <tr> <th>Kirkuk</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>2</td> <td>3</td> <td>0</td> <td>0</td> </tr> <tr> <th>Istanbul</th> <td>0</td> <td>0</td> <td>0</td> 
<td>0</td> <td>0</td> <td>0</td> <td>3</td> <td>0</td> <td>0</td> </tr> <tr> <th>Basra</th> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>4</td> <td>0</td> </tr> </tbody> </table> </div>

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 10))
# annot=True with fmt='d' writes the integer count into every cell; the
# colour bar is switched off.
hmap = sns.heatmap(df, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
plt.title('terror')
# Rotate the x-axis tick labels by 30 degrees.
plt.xticks(rotation=30)
plt.show()