https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_YARNpython
leewyang編輯本頁 20 days ago · 7修訂
本地克隆此wiki
在桌面克隆
在開始以前,您應該已經熟悉TensorFlow,而且能夠訪問安裝了Spark的Hadoop網格。若是您的網格有GPU節點,它們必須在本地安裝CUDA。
從網格網關下載Python並安裝到本地文件夾。這份Python安裝將分發給Spark執行器,以便任何自定義依賴項(包括TensorFlow)都可以被執行器使用。
# Build a private Python 2.7 under PYTHON_ROOT so it can later be zipped and
# shipped to the Spark executors.

# download and extract Python 2.7
export PYTHON_ROOT=~/Python
curl -O https://www.python.org/ftp/python/2.7.12/Python-2.7.12.tgz
tar -xvf Python-2.7.12.tgz
rm Python-2.7.12.tgz

# compile into local PYTHON_ROOT
pushd Python-2.7.12
./configure --prefix="${PYTHON_ROOT}" --enable-unicode=ucs4
make
make install
popd
rm -rf Python-2.7.12

# install pip
# The top-level get-pip.py no longer supports Python 2.7; use the
# version-pinned bootstrap script instead.
pushd "${PYTHON_ROOT}"
curl -O https://bootstrap.pypa.io/pip/2.7/get-pip.py
bin/python get-pip.py
rm get-pip.py

# install Python dependencies (pydoop here)
"${PYTHON_ROOT}/bin/pip" install pydoop
# Note: add any extra dependencies here
popd
# Fetch the TensorFlow sources; afterwards follow that repository's build
# instructions to install the result into ${PYTHON_ROOT}.
git clone git@github.com:yahoo/tensorflow.git
# Fetch the TensorFlow ecosystem sources (provides the Hadoop TFRecord
# input/output format).
git clone https://github.com/tensorflow/ecosystem.git
# follow build instructions to generate tensorflow-hadoop-1.0-SNAPSHOT.jar

# copy jar to HDFS for easier reference
hadoop fs -put tensorflow-hadoop-1.0-SNAPSHOT.jar
# Zip the contents of PYTHON_ROOT (not the directory itself) into Python.zip.
pushd "${PYTHON_ROOT}"
zip -r Python.zip *
popd

# copy this Python distribution into HDFS
hadoop fs -put "${PYTHON_ROOT}/Python.zip"
接下來,克隆這個repo並爲Spark構建一個zip包:
# Clone TensorFlowOnSpark and bundle the contents of its src/ directory into
# TensorFlowOnSpark/tfspark.zip.
git clone git@github.com:yahoo/TensorFlowOnSpark.git
pushd TensorFlowOnSpark/src
zip -r ../tfspark.zip *
popd
# Download the four MNIST archives into ${HOME}/mnist and bundle them into
# mnist.zip, which the data-conversion jobs ship via --archives.
mkdir "${HOME}/mnist"
pushd "${HOME}/mnist" >/dev/null
curl -O "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"
curl -O "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"
curl -O "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
curl -O "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
# create the archive that is referenced as mnist/mnist.zip#mnist
zip -r mnist.zip *
popd >/dev/null
# Convert the downloaded MNIST archives into CSV files and TFRecords on HDFS.

# set environment variables (if not already done)
export PYTHON_ROOT=~/Python
# NOTE(review): LD_LIBRARY_PATH is set from PATH here; confirm this is intended.
export LD_LIBRARY_PATH=${PATH}
export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python
export SPARK_YARN_USER_ENV="PYSPARK_PYTHON=Python/bin/python"
export PATH=${PYTHON_ROOT}/bin/:$PATH
export QUEUE=gpu

# for CPU mode:
#   export QUEUE=default
#   remove the "--conf spark.executorEnv.LD_LIBRARY_PATH" line
#   remove the "--driver-library-path" line

# save images and labels as CSV files
"${SPARK_HOME}/bin/spark-submit" \
  --master yarn \
  --deploy-mode cluster \
  --queue "${QUEUE}" \
  --num-executors 4 \
  --executor-memory 4G \
  --archives "hdfs:///user/${USER}/Python.zip#Python,mnist/mnist.zip#mnist" \
  --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64" \
  --driver-library-path="/usr/local/cuda-7.5/lib64" \
  TensorFlowOnSpark/examples/mnist/mnist_data_setup.py \
  --output mnist/csv \
  --format csv

# save images and labels as TFRecords
"${SPARK_HOME}/bin/spark-submit" \
  --master yarn \
  --deploy-mode cluster \
  --queue "${QUEUE}" \
  --num-executors 4 \
  --executor-memory 4G \
  --archives "hdfs:///user/${USER}/Python.zip#Python,mnist/mnist.zip#mnist" \
  --jars "hdfs:///user/${USER}/tensorflow-hadoop-1.0-SNAPSHOT.jar" \
  --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64" \
  --driver-library-path="/usr/local/cuda-7.5/lib64" \
  TensorFlowOnSpark/examples/mnist/mnist_data_setup.py \
  --output mnist/tfr \
  --format tfr
# Distributed MNIST training from the CSV data.

# for CPU mode:
#   export QUEUE=default
#   set --conf spark.executorEnv.LD_LIBRARY_PATH="$JAVA_HOME/jre/lib/amd64/server"
#   remove the "--driver-library-path" line

# for CDH (per @wangyum):
#   set --conf spark.executorEnv.LD_LIBRARY_PATH="/opt/cloudera/parcels/CDH/lib64:$JAVA_HOME/jre/lib/amd64/server"

# hadoop fs -rm -r mnist_model
"${SPARK_HOME}/bin/spark-submit" \
  --master yarn \
  --deploy-mode cluster \
  --queue "${QUEUE}" \
  --num-executors 4 \
  --executor-memory 27G \
  --py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/spark/mnist_dist.py \
  --conf spark.dynamicAllocation.enabled=false \
  --conf spark.yarn.maxAppAttempts=1 \
  --archives "hdfs:///user/${USER}/Python.zip#Python" \
  --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \
  --driver-library-path="/usr/local/cuda-7.5/lib64" \
  TensorFlowOnSpark/examples/mnist/spark/mnist_spark.py \
  --images mnist/csv/train/images \
  --labels mnist/csv/train/labels \
  --mode train \
  --model mnist_model
# to use infiniband, add --rdma
# Distributed MNIST inference on the CSV test data; writes results to the
# "predictions" output path.
"${SPARK_HOME}/bin/spark-submit" \
  --master yarn \
  --deploy-mode cluster \
  --queue "${QUEUE}" \
  --num-executors 4 \
  --executor-memory 27G \
  --py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/spark/mnist_dist.py \
  --conf spark.dynamicAllocation.enabled=false \
  --conf spark.yarn.maxAppAttempts=1 \
  --archives "hdfs:///user/${USER}/Python.zip#Python" \
  --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \
  --driver-library-path="/usr/local/cuda-7.5/lib64" \
  TensorFlowOnSpark/examples/mnist/spark/mnist_spark.py \
  --images mnist/csv/test/images \
  --labels mnist/csv/test/labels \
  --mode inference \
  --model mnist_model \
  --output predictions
# Distributed MNIST training from the TFRecords data.

# for CPU mode:
#   export QUEUE=default
#   set --conf spark.executorEnv.LD_LIBRARY_PATH="$JAVA_HOME/jre/lib/amd64/server"
#   remove the "--driver-library-path" line

# for CDH (per @wangyum):
#   set --conf spark.executorEnv.LD_LIBRARY_PATH="/opt/cloudera/parcels/CDH/lib64:$JAVA_HOME/jre/lib/amd64/server"

# hadoop fs -rm -r mnist_model
# Paths point into the TensorFlowOnSpark checkout, which is where tfspark.zip
# and the MNIST examples live.
"${SPARK_HOME}/bin/spark-submit" \
  --master yarn \
  --deploy-mode cluster \
  --queue "${QUEUE}" \
  --num-executors 4 \
  --executor-memory 27G \
  --py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist.py \
  --conf spark.dynamicAllocation.enabled=false \
  --conf spark.yarn.maxAppAttempts=1 \
  --archives "hdfs:///user/${USER}/Python.zip#Python" \
  --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \
  --driver-library-path="/usr/local/cuda-7.5/lib64" \
  TensorFlowOnSpark/examples/mnist/tf/mnist_spark.py \
  --images mnist/tfr/train \
  --format tfr \
  --mode train \
  --model mnist_model
# to use infiniband, replace the last line with --model mnist_model --rdma
# Distributed MNIST inference on the TFRecords test data; writes results to
# the "predictions" output path.

# hadoop fs -rm -r predictions
"${SPARK_HOME}/bin/spark-submit" \
  --master yarn \
  --deploy-mode cluster \
  --queue "${QUEUE}" \
  --num-executors 4 \
  --executor-memory 27G \
  --py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist.py \
  --conf spark.dynamicAllocation.enabled=false \
  --conf spark.yarn.maxAppAttempts=1 \
  --archives "hdfs:///user/${USER}/Python.zip#Python" \
  --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \
  --driver-library-path="/usr/local/cuda-7.5/lib64" \
  TensorFlowOnSpark/examples/mnist/tf/mnist_spark.py \
  --images mnist/tfr/test \
  --mode inference \
  --model mnist_model \
  --output predictions