# Run the downloaded Anaconda (Python 2, 4.1.1, 64-bit Linux) installer.
bash Anaconda2-4.1.1-Linux-x86_64.sh
# Install Oracle Java 8 via the WebUpd8 PPA.
# Fix: the original crammed four commands onto one line with literal '$'
# prompt markers embedded, which is not runnable shell — each command is
# now on its own line with the prompts removed.
sudo apt-get install software-properties-common
sudo add-apt-repository ppa:webupd8team/java
sudo apt-get update
sudo apt-get install oracle-java8-installer
# Open ~/.bashrc to append the JAVA_HOME settings below.
gedit .bashrc

# Lines to add to ~/.bashrc.
# Fix: the original put all four statements on one line with no separators,
# which is invalid shell ('export JAVA_HOME PATH=...' exports the wrong
# names). Each assignment/export is now a separate statement.
# NOTE(review): PATH usually gets $JAVA_HOME/bin appended, not $JAVA_HOME
# itself — kept as the original had it; confirm intent.
JAVA_HOME=/usr/lib/jvm/java-8-oracle
export JAVA_HOME
PATH=$PATH:$JAVA_HOME
export PATH
# Unpack the Spark 2.0.0 (Hadoop 2.7) distribution, then remove the archive.
# Fix: removed the literal '$' prompt markers so the lines are runnable.
tar -zxvf spark-2.0.0-bin-hadoop2.7.tgz
rm spark-2.0.0-bin-hadoop2.7.tgz
# Open ~/.bashrc to add the PySpark driver settings below.
gedit .bashrc

# Make 'pyspark' start its driver as an IPython kernel in notebook mode,
# so running ./bin/pyspark opens a Jupyter notebook instead of a REPL.
export PYSPARK_DRIVER_PYTHON=ipython
export PYSPARK_DRIVER_PYTHON_OPTS=notebook

# Launch PySpark from the unpacked Spark installation directory.
cd ~/spark-2.0.0-bin-hadoop2.7
./bin/pyspark
# coding: utf-8
# Jupyter-notebook export: word count over a text file with PySpark, then a
# horizontal bar chart of the 15 most frequent words.
# NOTE: `sc` (SparkContext) and `get_ipython` are provided by the pyspark
# notebook environment started above; they are not defined here.

import re
from operator import add

# In[13]: load the input file as an RDD of lines.
file_in = sc.textFile("/home/carl/spark/NOTICE")

# In[3]: lowercase and strip each line, then split on single spaces.
words = file_in.flatMap(lambda line: re.split(' ', line.lower().strip()))

# In[4]: keep only words longer than 3 characters (drops "", "the", etc.).
words = words.filter(lambda w: len(w) > 3)

# In[5]-[6]: classic word count — emit (word, 1) pairs, sum counts per word.
words = words.map(lambda w: (w, 1))
words = words.reduceByKey(add)

# In[7]: swap to (count, word) and sort by count, descending.
words = words.map(lambda x: (x[1], x[0])).sortByKey(False)

# In[8]: peek at the 15 most frequent words as (count, word) pairs.
words.take(15)

# In[9]: render plots inline in the notebook.
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt


def histogram(words):
    """Draw a horizontal bar chart for a list of (word, count) pairs.

    Bug fix: the original called len() directly on map() objects; on
    Python 3 map() returns an iterator, so len() raises TypeError.
    The results are materialized as lists first.
    """
    count = list(map(lambda x: x[1], words))
    word = list(map(lambda x: x[0], words))
    plt.barh(range(len(count)), count, color="green")
    plt.yticks(range(len(count)), word)


# In[10]: swap back to (word, count) pairs for plotting.
words = words.map(lambda x: (x[1], x[0]))

# In[11]: peek at the top 15 (word, count) pairs.
words.take(15)

# In[12]: plot the 15 most frequent words.
histogram(words.take(15))