問題導讀:
1.數據如何處理?
2.從數據中如何提取有用的特徵?
3.有哪些衍生特徵?
數據處理以及轉化
python
填充壞的或丟失的數據:
程序員
1
2
3
|
# Run field index 2 of every movie record through convert_year (defined
# elsewhere) and collect the results into a local list.
# NOTE(review): movie_fields is presumably a Spark RDD of split records -- confirm.
years_pre_processed = movie_fields.map(
    lambda record: convert_year(record[2])).collect()
# Wrap the collected list in a NumPy array for the indexing done later.
years_pre_processed_array = np.array(years_pre_processed)
|
01
02
03
04
05
06
07
08
09
10
|
# Entries equal to 1900 are excluded from the statistics and then overwritten.
# NOTE(review): 1900 is presumably the placeholder convert_year emits for
# unparseable dates -- confirm against its definition.
valid_years = years_pre_processed_array[years_pre_processed_array != 1900]
mean_year = np.mean(valid_years)
median_year = np.median(valid_years)
# Patch every 1900 entry rather than only the first one; this also avoids an
# IndexError when the array holds no 1900 at all. index_bad_data is therefore
# an array of positions (possibly empty) instead of a single index.
index_bad_data = np.where(years_pre_processed_array == 1900)[0]
years_pre_processed_array[index_bad_data] = median_year
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("Mean year of release: %d" % mean_year)
print("Median year of release: %d" % median_year)
print("Index of '1900' after assigning median: %s"
      % np.where(years_pre_processed_array == 1900)[0])
|
1
2
3
|
Mean year of release:
1989
Median year of release:
1995
Index of
'1900'
after assigning median: []
|
從數據中提取有用的特徵
算法
特徵數據能夠分爲如下幾類:
express
數字特徵
編程
分類特徵
數組
1
2
3
|
# Collect the distinct values found at field index 3 of the user records and
# keep them as a sorted list.
# NOTE(review): user_fields is presumably a Spark RDD of split records -- confirm.
all_occupations = sorted(
    user_fields.map(lambda record: record[3]).distinct().collect())
|
1
2
3
4
5
6
7
8
9
|
# Map each occupation to its position in all_occupations.
all_occupations_dict = dict(
    (occupation, position)
    for position, occupation in enumerate(all_occupations))
# idx ends up as the number of occupations processed, as in a manual counter.
idx = len(all_occupations)
# try a few examples to see what "1-of-k" encoding is assigned
print("Encoding of 'doctor': %d" % all_occupations_dict['doctor'])
print("Encoding of 'programmer': %d" % all_occupations_dict['programmer'])
|
1
2
|
Encoding of
'doctor'
:
2
Encoding of
'programmer'
:
14
|
1
2
3
4
5
6
|
# Build a 1-of-k vector for 'programmer': all zeros, length equal to the
# number of dictionary entries, with a single 1 at that occupation's index.
K = len(all_occupations_dict)
k_programmer = all_occupations_dict['programmer']
binary_x = np.zeros(K)
binary_x[k_programmer] = 1
print("Binary feature vector: %s" % binary_x)
print("Length of binary vector: %d" % K)
|
1
2
3
|
Binary feature vector: [
0.
0.
0.
0.
0.
0.
0.
0.
0.
0.
0.
0.
0.
0.
1.
0.
0.
0.
0.
0.
0.
]
Length of binary vector:
21
|
衍生特徵
app
轉化timestamps值爲分類特徵值
dom
1
2
3
|
def extract_datetime(ts):
    """Return the datetime.datetime built from timestamp *ts* via fromtimestamp."""
    # Imported locally so the function carries its own dependency.
    import datetime as dt
    return dt.datetime.fromtimestamp(ts)
|
1
2
3
|
# Field index 3 of each rating record, converted to int.
# NOTE(review): rating_data is presumably a Spark RDD of split records -- confirm.
timestamps = rating_data.map(lambda record: int(record[3]))
# Reduce every timestamp to the .hour of the datetime it converts to.
hour_of_day = timestamps.map(lambda stamp: extract_datetime(stamp).hour)
# Inspect the first five values (the result is only shown in an interactive shell).
hour_of_day.take(5)
|
1
|
[
17
,
21
,
9
,
7
,
7
]
|
01
02
03
04
05
06
07
08
09
10
11
|
def assign_tod(hr):
    """Return the time-of-day label for the hour *hr*.

    Labels and the hours they cover:
    'morning' 7-11, 'lunch' 12-13, 'afternoon' 14-17, 'evening' 18-22,
    'night' 23 and 0-6.

    Returns None when *hr* falls in none of the buckets (e.g. 24 or -1).
    """
    times_of_day = {
        'morning': range(7, 12),
        'lunch': range(12, 14),
        'afternoon': range(14, 18),
        'evening': range(18, 23),
        # The night bucket wraps past midnight. range(23, 7) is empty, so the
        # hours are listed explicitly; otherwise 23 and 0-6 match nothing.
        'night': [23] + list(range(0, 7)),
    }
    # items() works on both Python 2 and 3, unlike iteritems().
    for k, v in times_of_day.items():
        if hr in v:
            return k
    return None
|
1
2
|
# Label every hour with its time-of-day bucket by mapping assign_tod over it.
time_of_day = hour_of_day.map(assign_tod)
# Inspect the first five labels (the result is only shown in an interactive shell).
time_of_day.take(5)
|
1
|
[
'afternoon'
,
'evening'
,
'morning'
,
'morning'
,
'morning'
]
|
文本特徵值
機器學習
簡單文本特徵提取
函數
1
|
1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
|
01
02
03
04
05
06
07
08
09
10
|
def extract_title(raw):
    """Return *raw* with everything from the first "(word)" group onward removed.

    For a title such as "Toy Story (1995)" this yields "Toy Story". When *raw*
    contains no parenthesised run of word characters it is returned unchanged.
    """
    import re
    # Match a run of word characters enclosed in literal parentheses, e.g.
    # "(1995)". The parentheses must be escaped: an unescaped "(\w+)" is only
    # a capture group, matches the first word at offset 0, and makes the
    # slice below return an empty string for every title.
    grps = re.search(r"\((\w+)\)", raw)
    if grps:
        # Keep only the text before the match and strip the trailing
        # white space left in front of the parenthesis.
        return raw[:grps.start()].strip()
    else:
        return raw
|
1
2
|
# The raw title includes the release year, in the form: Toy Story (1995)
# NOTE(review): movie_fields is presumably a Spark RDD of split records -- confirm.
raw_titles = movie_fields.map(lambda fields: fields[1])
|
1
2
|
# Print the result of extract_title for the first five raw titles.
for raw_title in raw_titles.take(5):
    print(extract_title(raw_title))
|
1
2
3
4
5
|
Toy Story
GoldenEye
Four Rooms
Get Shorty
Copycat
|
1
2
3
4
5
|
# Run extract_title over every raw title.
movie_titles = raw_titles.map(extract_title)
# Next, tokenize the titles into terms using simple whitespace tokenization
# (a split on single spaces).
title_terms = movie_titles.map(lambda title: title.split(" "))
print(title_terms.take(5))
|
1
|
[[u
'Toy'
, u
'Story'
], [u
'GoldenEye'
], [u
'Four'
, u
'Rooms'
], [u
'Get'
,u
'Shorty'
], [u
'Copycat'
]]
|
01
02
03
04
05
06
07
08
09
10
|
# Gather every distinct term across all titles so that a dictionary of
# term <-> index mappings can be built.
all_terms = title_terms.flatMap(lambda terms: terms).distinct().collect()
# Assign each term its "1-of-k" index: its position in the collected list.
all_terms_dict = dict(
    (term, position) for position, term in enumerate(all_terms))
# idx ends up as the number of terms processed, as in a manual counter.
idx = len(all_terms)
|
1
2
3
|
# Report the dictionary size and spot-check the index of two terms.
print("Total number of terms: %d" % len(all_terms_dict))
print("Index of term 'Dead': %d" % all_terms_dict['Dead'])
print("Index of term 'Rooms': %d" % all_terms_dict['Rooms'])
|
1
2
3
|
Total number of terms:
2645
Index of term
'Dead'
:
147
Index of term
'Rooms'
:
1963
|
1
2
3
4
|
# Build the term -> index dictionary with zipWithIndex and collectAsMap
# instead of a local loop.
distinct_terms = title_terms.flatMap(lambda terms: terms).distinct()
all_terms_dict2 = distinct_terms.zipWithIndex().collectAsMap()
print("Index of term 'Dead': %d" % all_terms_dict2['Dead'])
print("Index of term 'Rooms': %d" % all_terms_dict2['Rooms'])
|
1
2
|
Index of term
'Dead'
:
147
Index of term
'Rooms'
:
1963
|
01
02
03
04
05
06
07
08
09
10
11
12
|
# this function takes a list of terms and encodes it as a scipy sparse
vector using an approach
# similar to the 1-of-k encoding
def
create_vector(terms, term_dict):
from
scipy
import
sparse as sp
num_terms
=
len
(term_dict)
x
=
sp.csc_matrix((
1
, num_terms))
for
t
in
terms:
|