Spacy簡單入門

安裝Spacy

pip install spacy

導入工具包和英文模型

# python -m spacy download en    (spaCy v3+: python -m spacy download en_core_web_sm)

文本處理

import spacy

# Load the small English pipeline by its full package name.  The 'en'
# shortcut link was removed in spaCy v3 and raises OSError there; the full
# name also works on v2 once the model package is installed.
nlp = spacy.load('en_core_web_sm')

# Tokenization: iterating over the parsed document yields its tokens.
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
for token in doc:
    print(token)

# Sentence segmentation: doc.sents yields one span per sentence.
for sent in doc.sents:
    print(sent)
Weather
is
good
,
very
windy
and
sunny
.
We
have
no
classes
in
the
afternoon
.
Weather is good, very windy and sunny.
We have no classes in the afternoon.

詞性

# Part-of-speech tagging: print every token joined to its POS tag by '-'.
for tok in doc:
    print(f'{tok}-{tok.pos_}')
Weather-NOUN
is-AUX
good-ADJ
,-PUNCT
very-ADV
windy-ADJ
and-CCONJ
sunny-ADJ
.-PUNCT
We-PRON
have-AUX
no-DET
classes-NOUN
in-ADP
the-DET
afternoon-NOUN
.-PUNCT

命名實體識別

from spacy import displacy

# Named-entity recognition on a short example sentence.
doc = nlp('I went to beijing where I met my old friend Jack from uni.')

# Print each detected entity span joined to its label by '-'.
for entity in doc.ents:
    print(f'{entity}-{entity.label_}')

# Visualise the entities; jupyter=True asks displacy for notebook output.
displacy.render(doc, style='ent', jupyter=True)
beijing-GPE
Jack-PERSON

<span class="tex2jax_ignore"><div class="entities" style="line-height: 2.5; direction: ltr">I went to
<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">

beijing
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span>

</mark>
where I met my old friend
<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">

Jack
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">PERSON</span>

</mark>
from uni.</div></span>

## 找出文中全部的人名

def getFileContent(path, encoding='utf-8'):
    """Return the whole content of a text file as a single string.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file.  Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        The decoded content of the file.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

# Parse the whole novel and report how many sentences were found in it.
novel_text = getFileContent('./data/pride_and_prejudice.txt')
doc = nlp(novel_text)
sents = list(doc.sents)
print(len(sents))
from collections import Counter,defaultdict

def find_person(doc, top_n=10):
    """Count PERSON entities in a parsed document and return the most frequent.

    Args:
        doc: Parsed document; every item of ``doc.ents`` must expose the
            ``label_`` and ``lemma_`` attributes.
        top_n: How many names to return.  Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        A list of ``(lemma, count)`` pairs, most frequent first.
    """
    # Entities are keyed by lemma exactly as produced; no case folding is done.
    person_counts = Counter(
        ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON'
    )
    return person_counts.most_common(top_n)
# Show the most frequently mentioned people in the novel.
top_people = find_person(doc)
print(top_people)
7153
[('Elizabeth', 600), ('Darcy', 355), ('Jane', 277), ('Bingley', 260), ('Bennet', 258), ('Collins', 166), ('Wickham', 108), ('Lizzy', 94), ('Gardiner', 90), ('Lady Catherine', 76)]

恐怖襲擊分析

def read_lines(path, encoding='utf-8'):
    """Return all lines of a text file as a list of strings.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file.  Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        The file's lines in order, each keeping its trailing newline.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.readlines()

# Parse every line of the dataset file as its own document.
text = read_lines('./data/rand-terrorism-dataset.txt')
nlp_list = [nlp(line) for line in text]

# Lower-cased group names to look for.  The spaces around '-' presumably
# mirror how hyphenated names come out as entity lemmas (e.g. 'al - Qaeda').
common_terrorist_groups = [
    'taliban',
    'al - qaeda',
    'hamas',
    'fatah',
    'plo',
    'bilad al - rafidayn',
]

# Lower-cased place names to look for.
common_locations = [
    'iraq',
    'baghdad',
    'kirkuk',
    'mosul',
    'afghanistan',
    'kabul',
    'basra',
    'palestine',
    'gaza',
    'israel',
    'istanbul',
    'beirut',
    'pakistan',
]

# Maps group name -> Counter of location name -> number of co-occurrences.
location_entity_dict = defaultdict(Counter)

for article in nlp_list:
    matched_groups = []
    matched_locations = []
    for ent in article.ents:
        if ent.label_ in ('PERSON', 'ORG'):
            # A group may be tagged either as a person or as an organisation.
            if ent.lemma_.lower() in common_terrorist_groups:
                matched_groups.append(ent.lemma_)
        elif ent.label_ == 'GPE':
            if ent.lemma_.lower() in common_locations:
                matched_locations.append(ent.lemma_)

    # Count every (group, location) pair found in the same document.  Keys
    # keep the lemma's own casing, so e.g. 'Hamas' and 'HAMAS' are tallied
    # as separate entries.
    for group in matched_groups:
        for location in matched_locations:
            location_entity_dict[group][location] += 1

location_entity_dict
defaultdict(collections.Counter,
            {'PLO': Counter({'Beirut': 9,
                      'ISRAEL': 17,
                      'Israel': 21,
                      'Iraq': 8,
                      'Palestine': 1}),
             'Fatah': Counter({'Israel': 18,
                      'Beirut': 1,
                      'Iraq': 1,
                      'ISRAEL': 4,
                      'Gaza': 11}),
             'Hamas': Counter({'ISRAEL': 7,
                      'Israel': 19,
                      'Beirut': 1,
                      'Gaza': 70}),
             'Taliban': Counter({'AFGHANISTAN': 3,
                      'Kabul': 45,
                      'Pakistan': 17,
                      'Afghanistan': 263}),
             'HAMAS': Counter({'ISRAEL': 1}),
             'Al - Qaeda': Counter({'Kabul': 1,
                      'Iraq': 4,
                      'Israel': 1,
                      'Baghdad': 5,
                      'Pakistan': 1,
                      'Mosul': 16,
                      'Kirkuk': 2}),
             'al - Qaeda': Counter({'Iraq': 46,
                      'Afghanistan': 6,
                      'Kabul': 2,
                      'Istanbul': 3,
                      'Baghdad': 14,
                      'Palestine': 3,
                      'Mosul': 1,
                      'Kirkuk': 3,
                      'Pakistan': 5}),
             'Bilad al - Rafidayn': Counter({'Iraq': 21,
                      'Baghdad': 32,
                      'Basra': 4,
                      'Mosul': 4,
                      'Palestine': 6}),
             'taliban': Counter({'Kabul': 1})})
import pandas as pd

# Turn the nested counters into a table: one column per group, one row per
# location.  No integer dtype is requested at construction because pairs that
# never co-occur come out as NaN, and pandas >= 2.0 raises IntCastingNaNError
# when asked to build an integer column that contains NaN.
df = pd.DataFrame.from_dict(dict(location_entity_dict))

# A missing pair means the two were never seen together: store 0, then
# convert the whole table to integers.
df = df.fillna(value=0).astype(int)

df

<div>
<style scoped>

.dataframe tbody tr th:only-of-type {
    vertical-align: middle;
}

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

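| | PLO | Fatah | Hamas | Taliban | HAMAS | Al - Qaeda | al - Qaeda | Bilad al - Rafidayn | taliban |
|---|---|---|---|---|---|---|---|---|---|
| Beirut | 9 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| ISRAEL | 17 | 4 | 7 | 0 | 1 | 0 | 0 | 0 | 0 |
| Israel | 21 | 18 | 19 | 0 | 0 | 1 | 0 | 0 | 0 |
| Iraq | 8 | 1 | 0 | 0 | 0 | 4 | 46 | 21 | 0 |
| Palestine | 1 | 0 | 0 | 0 | 0 | 0 | 3 | 6 | 0 |
| Gaza | 0 | 11 | 70 | 0 | 0 | 0 | 0 | 0 | 0 |
| AFGHANISTAN | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 |
| Kabul | 0 | 0 | 0 | 45 | 0 | 1 | 2 | 0 | 1 |
| Pakistan | 0 | 0 | 0 | 17 | 0 | 1 | 5 | 0 | 0 |
| Afghanistan | 0 | 0 | 0 | 263 | 0 | 0 | 6 | 0 | 0 |
| Baghdad | 0 | 0 | 0 | 0 | 0 | 5 | 14 | 32 | 0 |
| Mosul | 0 | 0 | 0 | 0 | 0 | 16 | 1 | 4 | 0 |
| Kirkuk | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 |
| Istanbul | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 |
| Basra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 |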

</div>

import matplotlib.pyplot as plt
import seaborn as sns
# Draw the group/location co-occurrence table as an annotated heatmap.
plt.figure(figsize=(12, 10))
# fmt='d' prints each cell annotation as an integer; the colour bar is hidden.
hmap = sns.heatmap(df, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
# Title typo fixed ('trror' -> 'terror').
plt.title('terror')
# Rotate the x-axis tick labels by 30 degrees.
plt.xticks(rotation=30)
plt.show()

output_18_0

相關文章
相關標籤/搜索