根據暱稱爬取id的數據預處理以及各式轉換
#!/bin/bash
# Join the pre-processed nickname list with the crawled user records and
# derive a nickname -> id map from the result.
#
# Inputs (tab-separated, under $root_dir/source_data/):
#   data.dealed  keyed by column 1; its columns 2 and 3 are appended to the output
#   user.out     keyed by column 1; must be placed here before running
# Outputs (under $root_dir/result_data/):
#   user.all     joined records
#   name_id.map  first two columns of user.all

root_dir=$(pwd)
source_dir="$root_dir/source_data"
result_dir="$root_dir/result_data"
out_all_file="$result_dir/user.all"
out_map="$result_dir/name_id.map"

# The redirections below fail if the output directory does not exist yet.
mkdir -p -- "$result_dir"
rm -rf -- "$out_all_file" "$out_map"

#######put the user.out in the directory $root_dir/source_data/#######
####processing the jar#################################
# The Java parsing step is disabled; user.out has to be supplied by hand.
#java -cp "$root_dir"/src/weiboApi.jar com.bobo.parser.ParseUserInfo $root_dir
echo "java processing is done"

#########data output after translate########
##screenName/id/gender/location/status_count/friendscount/followerscount/verfied#####
# Strip carriage returns in place so the join keys compare equal.
dos2unix "$source_dir/data.dealed"
dos2unix "$source_dir/user.out"

# join only pairs lines correctly when both inputs are sorted on the join
# field, so sort them on column 1 first. LC_ALL=C makes sort and join agree on
# the collation order. Unmatched lines of data.dealed are kept (-a 1) with the
# missing fields filled in as "null".
LC_ALL=C join -a 1 -t $'\t' -e "null" \
  -o 1.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.2 1.3 \
  <(LC_ALL=C sort -t $'\t' -k 1,1 -- "$source_dir/data.dealed") \
  <(LC_ALL=C sort -t $'\t' -k 1,1 -- "$source_dir/user.out") \
  > "$out_all_file"
echo "generate all_info is done"

cut -f 1,2 -- "$out_all_file" > "$out_map"
echo "generate name_id.map is done"
#!/bin/bash
# Flatten the two-level "人氣榜" tree (<first_dir>/<second_dir>.txt) found in
# the current directory into a single file, data.dealed, in which every input
# line is followed by a tab and the "<first_dir> <second_dir>" label of the
# file it came from.

tmp_dir=$(pwd)
root_dir="$tmp_dir/人氣榜"
category_tmpfile="$tmp_dir/category.tmp"
data_tmpfile="$tmp_dir/data.tmp"
data_file="$tmp_dir/data.dealed"

rm -rf -- "$category_tmpfile" "$data_tmpfile" "$data_file"
# Start from empty files so dos2unix/paste still work when no input is found.
: > "$category_tmpfile"
: > "$data_tmpfile"

# Glob instead of parsing `ls`: names with spaces or extra dots stay intact
# and only real *.txt files are picked up.
for first_path in "$root_dir"/*/; do
  [[ -d "$first_path" ]] || continue
  first_dir=${first_path%/}
  first_dir=${first_dir##*/}

  for txt_file in "$first_path"*.txt; do
    [[ -f "$txt_file" ]] || continue
    second_dir=${txt_file##*/}
    second_dir=${second_dir%.txt}

    cat -- "$txt_file" >> "$data_tmpfile"

    # Emit one label per newline-terminated line so that the category file
    # stays aligned line-for-line with the concatenated data file.
    line_count=$(wc -l < "$txt_file")
    line_count=${line_count:-0}
    # NOTE(review): the two labels are separated by a space, so they land in
    # a single tab-delimited column of data.dealed; confirm a tab was not
    # intended.
    for ((i = 0; i < line_count; i++)); do
      printf '%s\n' "$first_dir $second_dir"
    done >> "$category_tmpfile"
  done
done

# Remove the carriage returns (^M) left by files that were created on another
# system before the two files are pasted together.
dos2unix "$category_tmpfile"
dos2unix "$data_tmpfile"
paste -d "\t" "$data_tmpfile" "$category_tmpfile" > "$data_file"
#!/bin/sh
# For each of the six fans-count data files, count the distinct
# whitespace-separated tokens found in the third tab-separated column, print
# the count, and append it to result.data in the current directory.

cur_dir=$(pwd)
source_dir="/data/beiyou/minelab/fans_count_list"
result_file="$cur_dir/result.data"

for index in 0 1 2 3 4 5; do
  file="$source_dir/$index.data"
  # echo "$file"
  # One token per line, de-duplicated, then counted. tr removes the padding
  # that some wc implementations put in front of the number.
  count=$(cut -f 3 -- "$file" \
    | awk '{ for (i = 1; i <= NF; i++) print $i }' \
    | sort -u \
    | wc -l \
    | tr -d '[:space:]')
  echo "$count"
  echo "$count" >> "$result_file"
  echo "$file is done"
done
#!/bin/bash
# Video word statistics for one day:
#   1. extract two tab-separated columns from each known *.retain dump,
#   2. run the SinaPre Java tool on the extract (segmentation and stop-word
#      removal, going by the directory comments below),
#   3. count word frequencies per program across all segmented files,
#   4. write the first 20 lines of every per-program list to one file.

root_dir=/data/beiyou/minelab
#date=$(date -d "0 day ago" +%Y%m%d)
date=20140119
# the raw data dir
source_dir="$root_dir/source_data/Video/$date"
# the predata (extract two columns) dir
pre_dir="$root_dir/Src/liweibo/source_data/video/$date"
# the segment and remove stopwords dir
seg_dir="$root_dir/Src/liweibo/result_data/video/$date"
# the total word_count dir
word_count_dir="$root_dir/Src/liweibo/result_data/videoWordCount/$date"
# the yinhang dir
yinhang_dir="$root_dir/Src/yinhang"
# the final dir, top n word of every program
outfile_final_dir="$root_dir/VideoResult/$date"

# Recreate the working directories from scratch. ${var:?} aborts instead of
# running rm -rf on an empty path; -p also creates missing parents.
rm -rf -- "${pre_dir:?}" "${seg_dir:?}" "${word_count_dir:?}" "${outfile_final_dir:?}"
mkdir -p -- "$pre_dir" "$seg_dir" "$word_count_dir" "$outfile_final_dir"

# NOTE(review): outputs are keyed by file name only, so the same file name in
# two sub-directories overwrites the earlier result; confirm the layout has a
# single dump per site.
for dir_path in "$source_dir"/*/; do
  [[ -d "$dir_path" ]] || continue
  for file_path in "$dir_path"*; do
    file_name=${file_path##*/}
    # The second extracted column sits in a different position for pptv.
    case "$file_name" in
      aiqiyi.retain | youku.retain | souhu.retain) second_col=4 ;;
      pptv.retain) second_col=3 ;;
      *) continue ;;
    esac

    awk -F'\t' -v col="$second_col" '{ print $2 "\t" $col }' "$file_path" \
      > "$pre_dir/$file_name.pre"
    java -cp "$yinhang_dir/forLiWeiBo.jar" SinaPre \
      "$pre_dir/$file_name.pre" \
      "$seg_dir/$file_name.seg" \
      "$root_dir/source_data/Common/stopwords.list" \
      "$yinhang_dir/bin/"
    echo "$file_name datapre and segment is done!"
  done
done

echo "begin to calculate word count..."

# Read the whitespace-separated program names into an array; unlike an
# unquoted $(cat ...) this does not glob-expand the names.
program_list=()
read -r -d '' -a program_list < "$root_dir/Src/liweibo/conf/program.list"
seg_files=("$seg_dir"/*)

for program in "${program_list[@]}"; do
  # Nothing was segmented: the glob stayed literal, so there is nothing to count.
  [[ -e "${seg_files[0]}" ]] || break
  # Count over ALL segmented files at once. Appending one sorted list per file
  # would leave several independent sections in the .tf file, and the head
  # below would then only reflect the first of them. The program name is
  # matched as a fixed string (-F), not as a regular expression.
  grep -h -F -- "$program" "${seg_files[@]}" \
    | awk -F'\t' '{ n = split($2, words, " "); for (i = 1; i <= n; i++) print words[i] }' \
    | sort \
    | uniq -c \
    | sort -r -n \
    > "$word_count_dir/$program.tf"
done

echo "begin to generate top n..."
head -n 20 "$word_count_dir"/*.tf >> "$outfile_final_dir/videoWordTf.topn"
#!/bin/bash
# Usage: <script> TOP_N
#
# Pre-processing and segmentation stage of the video word statistics:
# extracts two tab-separated columns from each known *.retain dump and runs
# the SinaPre Java tool on the extract (segmentation and stop-word removal,
# going by the directory comments below).
#
# TOP_N is the number of top words to keep. It is validated and stored in
# topN, but the part of the script shown here ends after segmentation and
# does not use it yet.

root_dir=/data/beiyou/minelab
#date=$(date -d "0 day ago" +%Y%m%d)
date=20140119
# the raw data dir
source_dir="$root_dir/source_data/Video/$date"
# the predata (extract two columns) dir
pre_dir="$root_dir/Src/liweibo/source_data/video/$date"
# the segment and remove stopwords dir
seg_dir="$root_dir/Src/liweibo/result_data/video/$date"
# the total word_count dir
word_count_dir="$root_dir/Src/liweibo/result_data/videoWordCount/$date"
# the yinhang dir
yinhang_dir="$root_dir/Src/yinhang"
# the final dir, top n word of every program
outfile_final_dir="$root_dir/VideoResult/$date"

########the top num of word#####
# Exactly one argument is required and it must be a non-negative integer;
# reject anything else before any directory is wiped.
if [[ "$#" -ne 1 || ! "$1" =~ ^[0-9]+$ ]]; then
  echo "error parameters!"
  exit 1
fi
topN=$1

# Recreate the working directories from scratch. ${var:?} aborts instead of
# running rm -rf on an empty path; -p also creates missing parents.
rm -rf -- "${pre_dir:?}" "${seg_dir:?}" "${word_count_dir:?}" "${outfile_final_dir:?}"
mkdir -p -- "$pre_dir" "$seg_dir" "$word_count_dir" "$outfile_final_dir"

# NOTE(review): outputs are keyed by file name only, so the same file name in
# two sub-directories overwrites the earlier result; confirm the layout has a
# single dump per site.
for dir_path in "$source_dir"/*/; do
  [[ -d "$dir_path" ]] || continue
  for file_path in "$dir_path"*; do
    file_name=${file_path##*/}
    # The second extracted column sits in a different position for pptv.
    case "$file_name" in
      aiqiyi.retain | youku.retain | souhu.retain) second_col=4 ;;
      pptv.retain) second_col=3 ;;
      *) continue ;;
    esac

    awk -F'\t' -v col="$second_col" '{ print $2 "\t" $col }' "$file_path" \
      > "$pre_dir/$file_name.pre"
    java -cp "$yinhang_dir/forLiWeiBo.jar" SinaPre \
      "$pre_dir/$file_name.pre" \
      "$seg_dir/$file_name.seg" \
      "$root_dir/source_data/Common/stopwords.list" \
      "$yinhang_dir/bin/"
    echo "$file_name datapre and segment is done!"
  done
done
vi的一些技巧:
去除空行
:1,$g/^$/d