# Extract noun phrases from each video's language description and write,
# per video, the phrases plus their vocabulary indices (padded with 0
# up to max_phrase_length indices per phrase).
import os
import nltk
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Maximum number of word indices written per phrase; shorter phrases
# are padded with 0 up to this length.
max_phrase_length = 5

basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/'
path = basicPath
files = os.listdir(path)
print(path)

# Load the vocabulary: one word per line, stripping the trailing newline
# and any trailing period.
word_base_path = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/word_list.txt'
wordList = []
with open(word_base_path, 'r') as wordBase:
    for line in wordBase:
        wordList.append(line.rstrip('\n').rstrip('.'))

for videoName in files:
    print(videoName)

    # Each video directory contains a one-line natural-language description.
    langPath = path + videoName + '/language.txt'
    ## for other datasets:
    # langPath = path + videoName + '/' + videoName + '.txt'
    with open(langPath, 'r') as f:
        language = f.readline()

    # POS tags are computed here but not used further in this script.
    words = word_tokenize(language)
    token_results = nltk.pos_tag(words)

    # TextBlob extracts the noun phrases from the sentence.
    blob = TextBlob(language)
    print(blob.noun_phrases)

    langPath_Phrase = path + videoName + '/auto_extracted_Phrase.txt'
    langPath_PhraseIndex = path + videoName + '/autoExtracted_Phrase_Index.txt'

    with open(langPath_Phrase, 'w') as f_phrase, \
         open(langPath_PhraseIndex, 'w') as f_phrase_Idx:
        for phrase in blob.noun_phrases:
            # One noun phrase per line in the phrase file.
            f_phrase.write(phrase)
            f_phrase.write('\n')

            written_num = 0
            if len(phrase) > 1:
                word_ = word_tokenize(phrase)
                for word in word_:
                    # Map each word to its vocabulary index. Fall back to 0
                    # for out-of-vocabulary words rather than letting
                    # list.index() raise a ValueError; this assumes index 0
                    # doubles as the padding/unknown slot, consistent with
                    # the 0-padding below.
                    wordINDEX = wordList.index(word) if word in wordList else 0
                    f_phrase_Idx.write(str(wordINDEX))
                    f_phrase_Idx.write(',')
                    written_num += 1

                # Pad with 0s so every phrase yields max_phrase_length
                # indices. (Phrases longer than max_phrase_length are
                # not truncated.)
                if written_num < max_phrase_length:
                    for k in range(max_phrase_length - written_num):
                        f_phrase_Idx.write('0')
                        f_phrase_Idx.write(',')
                f_phrase_Idx.write('\n')
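
# A minimal sketch of reading an emitted index file back into lists of
# vocabulary indices. `load_phrase_indices` is a hypothetical helper name,
# not part of the pipeline above; it only assumes the format this script
# writes (comma-separated indices with a trailing comma, one phrase per line).
def load_phrase_indices(index_path):
    phrases = []
    with open(index_path, 'r') as f:
        for line in f:
            fields = line.rstrip('\n').split(',')
            # Drop the empty field produced by the trailing comma.
            phrases.append([int(x) for x in fields if x != ''])
    return phrases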