Python 中文分句 - CSDN博客 https://blog.csdn.net/laoyaotask/article/details/9260263 html
# Sentence-boundary markers; adjust to suit the text being processed.
#
# Each punctuation mark is listed in both its ASCII and its full-width form.
# The original list repeated ';', '?' and ':' verbatim, which presumably means
# the full-width variants were flattened to ASCII when the snippet was copied.
# An earlier variant of this list also contained '\n' and '\t'; they are
# deliberately not treated as boundaries here.
#
# NOTE: when Cut() is given a string or a list of single characters it compares
# one character at a time, so the multi-character entries '...' and '、、、'
# can never match. They are kept only for fidelity with the original list.
cutlist = [
    '。', ';', ';', '?', '?', '.', '...', '、、、', ':', ':', ',', ',',
]


def FindToken(cutlist, char):
    """Return True if *char* is one of the markers in *cutlist*, else False."""
    return char in cutlist


def Cut(cutlist, lines):
    """Split *lines* into sentences, each one ending with its boundary marker.

    Args:
        cutlist: Collection of sentence-boundary markers.
        lines: The text to split; a string or any iterable of characters.

    Returns:
        A list of sentence strings in input order. Text that follows the last
        marker (or the whole input, if it contains no marker) is returned as a
        final element rather than being discarded.
    """
    sentences = []  # completed sentences
    current = []    # characters collected since the previous boundary
    for ch in lines:
        current.append(ch)
        if FindToken(cutlist, ch):
            sentences.append(''.join(current))
            current = []
    # Flush the unterminated tail; without this, a line that lacks closing
    # punctuation would be lost entirely.
    if current:
        sentences.append(''.join(current))
    return sentences


if __name__ == '__main__':
    # Read tmp.txt line by line, split every line into sentences and collect
    # the non-blank ones (surrounding whitespace removed) in r_s.
    r_s = []
    with open('tmp.txt', 'r', encoding='utf-8') as fr:
        for lines in fr:
            for line in Cut(cutlist, lines):
                line = line.strip()
                if line:
                    r_s.append(line)
    dd = 9  # NOTE(review): looks like a debugger breakpoint anchor
雷鋒網AI科技評論按:去年7月20日,國務院正式印發《新一代人工智能發展規劃》的通知,《規劃》中指出,接下來人工智能將成爲國家重要發展戰略之一,其意義影響到國家的國際競爭力、經濟發展、社會建設等等大方向。 爲了落實《新一代人工智能發展規劃》,人才培養是關鍵。教育部在近日正式發佈了《高等學校人工智能創新行動計劃》。 教育部在《高等學校人工智能創新行動計劃》中強調,要增強人工智能領域專業建設,推動「新工科」建設,造成「人工智能+X」複合專業培養新模式。到 2020 年建設 100 個「人工智能+X」複合特點專業,推進重要方向的教材和在線開放課程建設。到 2020 年編寫 50 本具備國際一流水平的本科生和研究生教材、建設 50 門人工智能領域國家級精品在線開放課程、創建 50 家人工智能學院、研究院或交叉研究中心,並引導高校經過增量支持和存量調整,加大人工智能領域人才培養力度。在職業院校大數據、信息管理相關專業中增長人工智能相關內容,培養人工智能應用領域技術技能人才。 此外,教育部還列出了三個階段性目標: 到 2020 年,基本完成適應新一代人工智能發展的高校科技創新體系和學科體系的優化佈局,高校在新一代人工智能基礎理論和關鍵技術研究等方面取得新突破,人才培養和科學研究的優點進一步提高,並推進人工智能技術普遍應用。 到 2025 年,高校在新一代人工智能領域科技創新能力和人才培養質量顯著提高,取得一批具備國際重要影響的原創成果,部分理論研究、創新技術與應用示範達到世界領先水平,有效支撐我國產業升級、經濟轉型和智能社會建設。 到 2030 年,高校成爲建設世界主要人工智能創新中心的核心力量和引領新一代人工智能發展的人才高地,爲我國躋身創新型國家前列提供科技支撐和人才保障。 如下是《高等學校人工智能創新行動計劃》全文: 帶有兩個文本字段和一個提交按鈕的 HTML 表單: <form action="form_action.asp" method="get"> <p>Name: <input type="text" name="fullname" /></p> <p>Email: <input type="text" name="email" /></p> <input type="submit" value="Submit" /> </form> 親自試一試 定義和用法 name 屬性規定 input 元素的名稱。 name 屬性用於對提交到服務器後的表單數據進行標識,或者在客戶端經過 JavaScript 引用表單數據。 註釋:只有設置了 name 屬性的表單元素才能在提交表單時傳遞它們的值。
# Sentence-boundary markers; adjust to suit the text being processed.
#
# Each punctuation mark is listed in both its ASCII and its full-width form.
# The original list repeated ';', '?' and ':' verbatim, which presumably means
# the full-width variants were flattened to ASCII when the snippet was copied.
# Earlier variants of this list also contained '\n' and '\t'; they are
# deliberately not treated as boundaries here.
#
# NOTE: when Cut() is given a string or a list of single characters it compares
# one character at a time, so the multi-character entries '...' and '、、、'
# can never match. They are kept only for fidelity with the original list.
cutlist = [
    '。', ';', ';', '?', '?', '.', '...', '、、、', ':', ':', ',', ',',
]


def FindToken(cutlist, char):
    """Return True if *char* is one of the markers in *cutlist*, else False."""
    return char in cutlist


def Cut(cutlist, lines):
    """Split *lines* into sentences, each one ending with its boundary marker.

    Args:
        cutlist: Collection of sentence-boundary markers.
        lines: The text to split; a string or any iterable of characters.

    Returns:
        A list of sentence strings in input order. Text that follows the last
        marker (or the whole input, if it contains no marker) is returned as a
        final element rather than being discarded.
    """
    sentences = []  # completed sentences
    current = []    # characters collected since the previous boundary
    for ch in lines:
        current.append(ch)
        if FindToken(cutlist, ch):
            sentences.append(''.join(current))
            current = []
    # Flush the unterminated tail; without this, a line that lacks closing
    # punctuation would be lost entirely.
    if current:
        sentences.append(''.join(current))
    return sentences


if __name__ == '__main__':
    # Pass 1: read tmp.txt line by line, split every line into sentences and
    # collect the non-blank ones (surrounding whitespace removed) in r_s.
    r_s = []
    with open('tmp.txt', 'r', encoding='utf-8') as fr:
        for lines in fr:
            for line in Cut(cutlist, lines):
                line = line.strip()
                if line:
                    r_s.append(line)

    # Pass 2: concatenate the whole file into one string with the newlines
    # removed. A single join replaces the original per-line re-formatting of
    # the accumulated string, which copied it once per line.
    with open('tmp.txt', 'r', encoding='utf-8') as fr:
        str_ = ''.join(lines.replace('\n', '') for lines in fr)

    dd = 9  # NOTE(review): looks like a debugger breakpoint anchor
數據庫html 數據的分句 python
# Scratch SQL notes kept from the original (the Chinese table / primary-key
# placeholders are rendered as `tbl` / `pk`). The ',xiaole20180410SPLIT,'
# literal selected between Info_ID and content is the same marker the loader
# below splits on, so these queries are presumably how db.uid.para.txt was
# exported -- TODO confirm.
#
#   SELECT * FROM Info_Roles WHERE Flag=1 LIMIT 2;
#   select top y * from tbl where pk not in(select top (x-1)*y pk from tbl)
#   -- If the table has no primary key, use a temporary table with an added
#   -- identity column. x and y here can be variables.
#   select id=identity(int,1,1),* into #tb from tbl
#   select * from #tb where id between (x-1)*y and x*y-1
#   select top 1000 Info_ID from Info_Roles
#   select top 2000 Info_ID,',xiaole20180410SPLIT,',content from Info_Content where Info_ID not in( select top 1000 Info_ID from Info_Roles ) ;
#   select top 399 Info_ID,',xiaole20180410SPLIT,',UPPER(content) from Info_Content ;
#   select top 399 CHARINDEX('IMG',UPPER(content)) from Info_Content ;
#   select top 15 Info_ID,',xiaole20180410SPLIT,',content from Info_Content where CHARINDEX('IMG',UPPER(content))>0;
#   select top 15 Info_ID,',xiaole20180410SPLIT,',content from Info_Content where Info_ID in( select top 1000 Info_ID from Info_Roles ) and CHARINDEX('IMG',UPPER(content))>0 ;
#   SELECT TOP 15 Info_ID, ',xiaole20180410SPLIT,', content FROM Info_Content WHERE Info_ID IN ( SELECT TOP 1000 Info_ID FROM Info_Roles WHERE Flag = 1 ) AND CHARINDEX('IMG', UPPER(content)) > 0;
#   SELECT TOP 200 Info_ID, ',xiaole20180410SPLIT,', content FROM Info_Content WHERE Info_ID IN ( SELECT TOP 90000 Info_ID FROM Info_Roles ) AND CHARINDEX('<IMG', UPPER(content)) > 0;

import glob
import os
import re
import threading
import time
from urllib.parse import urlsplit

import cv2
import requests
from aip import AipSpeech
from bs4 import BeautifulSoup
from selenium import webdriver

# Marker separating the record id from the HTML content in each exported row.
xlsplit_str = ',xiaole20180410SPLIT,'
f_db_txt = 'db.uid.para.txt'

# --- Chinese sentence splitting -------------------------------------------
#
# Each punctuation mark is listed in both its ASCII and its full-width form.
# The original lists repeated ';', '?', ':' and ',' verbatim, which presumably
# means the full-width variants were flattened to ASCII when the code was
# copied.
#
# NOTE: Cut() compares one character at a time when given a string, so the
# multi-character entries '...' and '、、、' can never match there; they do
# take effect in the regex-based helpers further down.
cutlist = [
    '。', ';', ';', '?', '?', '.', '...', '、、、', ':', ':', ',', ',',
]

# Delimiters used by paragraph_to_sentence() / sentence_l_to_sentence_l_l().
sentence_split_l = [
    '\n', '\t', '。', ';', ';', '?', '?', '.', '...', '、、、', ',', ',',
]

# Download directory for images. An earlier location was
# 'C:\\Users\\sas\\PycharmProjects\\py_win_to_unix\\crontab_chk_url\\personas\\trunk\\plugins\\spider\\dl_img_tmp\\'
img_dir = 'C:\\Users\\sas\\PycharmProjects\\produce_video\\mypng\\'

# NOTE(review): these API credentials are hard-coded in source. They should be
# rotated and loaded from the environment or a secrets store instead.
bd_k_l = ['11059852', '5Kk01GtG2fjCwpzEkwdn0mjw', 'bp6Wyx377Elq7RsCQZzTBgGUFzLm8G2A']
APP_ID, API_KEY, SECRET_KEY = bd_k_l


def FindToken(cutlist, char):
    """Return True if *char* is one of the markers in *cutlist*, else False."""
    return char in cutlist


def Cut(cutlist, lines):
    """Split *lines* into sentences, each one ending with its boundary marker.

    Args:
        cutlist: Collection of sentence-boundary markers.
        lines: The text to split; a string or any iterable of characters.

    Returns:
        A list of sentence strings in input order. Text that follows the last
        marker (or the whole input, if it contains no marker) is returned as a
        final element rather than being discarded.
    """
    sentences = []  # completed sentences
    current = []    # characters collected since the previous boundary
    for ch in lines:
        current.append(ch)
        if FindToken(cutlist, ch):
            sentences.append(''.join(current))
            current = []
    # Flush the unterminated tail so trailing text is not lost.
    if current:
        sentences.append(''.join(current))
    return sentences


def _split_on_delimiters(text, delimiters):
    """Split *text* on any of *delimiters* and return the non-empty pieces."""
    # Longest first, so that '...' is consumed as one delimiter and not as
    # three separate '.' matches.
    tokens = sorted({d for d in delimiters if d}, key=lambda d: (-len(d), d))
    if not tokens:
        return [text] if text else []
    pattern = '|'.join(re.escape(d) for d in tokens)
    return [piece for piece in re.split(pattern, text) if piece]


def paragraph_to_sentence(paragraph, sentence_l):
    """Append the sentences found in *paragraph* to *sentence_l*.

    Spaces are removed from *paragraph* first; it is then split on every
    delimiter in ``sentence_split_l``. Delimiters are not kept and empty
    pieces are dropped.

    The original version returned the whole paragraph unless it contained
    '\\n', followed only the second piece of each split, and appended
    duplicate entries.

    Args:
        paragraph: Text to split.
        sentence_l: List that receives the sentences (mutated in place).

    Returns:
        The same *sentence_l* object, for convenience.
    """
    sentence_l.extend(
        _split_on_delimiters(paragraph.replace(' ', ''), sentence_split_l))
    return sentence_l


def paragraph_to_sentence_no_recursion(paragraph, sentence_l):
    """Iterative twin of paragraph_to_sentence(); same contract and result.

    The original body was a verbatim copy of the recursive function (and
    still recursed through it), so it now simply delegates.
    """
    return paragraph_to_sentence(paragraph, sentence_l)


def sentence_l_to_sentence_l_l(sentence_l):
    """Split every string in *sentence_l* further and return one flat list.

    Each element is split on every delimiter in ``sentence_split_l``; empty
    pieces are dropped. An element that contains no delimiter is emitted
    exactly once (the original emitted it once per delimiter).
    """
    sentence_l_l = []
    for sentence in sentence_l:
        sentence_l_l.extend(_split_on_delimiters(sentence, sentence_split_l))
    return sentence_l_l


# Example image URL: http://www.lky365.com/editor/uploadfile/20090508144220411.jpg
# Example target dir: C:\Users\sas\PycharmProjects\produce_video\mypng
def spider_webimg_dl_return_local_img_path(img_dir, img_url, uid, local_default='default.DONOT_REMOVE.png'):
    """Download *img_url* into *img_dir* and return the local file path.

    The file is named ``g3db<uid>g3uid<stem>.png`` where ``<stem>`` is the
    last path component of the URL without its extension. The '.png' suffix
    is applied regardless of the real image format, as in the original.

    Args:
        img_dir: Destination directory, including the trailing separator.
        img_url: URL of the image to fetch.
        uid: Record id embedded in the file name.
        local_default: File name (inside *img_dir*) returned when the download
            does not succeed.

    Returns:
        The path of the written file, or ``img_dir + local_default`` when the
        request fails, returns a non-200 status or an empty body, or the file
        cannot be written. Failures are printed, never raised.
    """
    fallback = '%s%s' % (img_dir, local_default)
    # Take the stem from the URL *path*. The original used
    # img_url.split('.')[0], which for an absolute URL stops at the first dot
    # of the host name ("http://www") and so named every image "www".
    stem = os.path.splitext(urlsplit(img_url).path.rsplit('/', 1)[-1])[0]
    target = '{}{}{}{}{}{}'.format(img_dir, 'g3db', uid, 'g3uid', stem, '.png')
    try:
        # One request only (the original fetched the URL twice), with a
        # timeout so a stalled server cannot hang the run.
        resp = requests.get(img_url, timeout=30)
        if resp.status_code == 200 and resp.content:
            with open(target, 'wb') as f:
                f.write(resp.content)
            return target
        print(img_url)
    except (requests.RequestException, OSError) as e:
        print(img_url, ',,,', uid)
        print(e)
    return fallback


def gen_bd_mp3(uid, str_):
    """Synthesize *str_* to speech and save it as ``g3db<uid>g3uid.mp3``.

    Args:
        uid: Record id embedded in the output file name.
        str_: Text to synthesize.

    Returns:
        None. The MP3 is written into the hard-coded ``mp3_dir``; if the API
        reports an error, the error dict is printed and no file is written.
    """
    mp3_dir = 'C:\\Users\\sas\\PycharmProjects\\produce_video\\mymp3\\'
    client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
    result = client.synthesis(str_, 'zh', 1, {'vol': 5})
    # On success the API returns the audio bytes; on error it returns a dict
    # (see the API's error codes).
    if isinstance(result, dict):
        print(uid, result)
        return
    f_w = '{}{}{}{}{}'.format(mp3_dir, 'g3db', uid, 'g3uid', '.mp3')
    with open(f_w, 'wb') as f:
        f.write(result)


if __name__ == '__main__':
    # ---- 1. Load the exported rows: uid -> {'html': [fragments...]} --------
    uid_d = {}
    uid = None
    with open(f_db_txt, 'r', encoding='utf-8') as fr:
        for i in fr:
            i = i.replace('\t', '').replace('\n', '')
            if xlsplit_str in i:
                l = i.split(xlsplit_str)
                uid = l[0].replace(' ', '')
                uid_d[uid] = {'html': [l[1]]}
            elif uid is not None:
                # Continuation line of the current record. Lines that appear
                # before the first marker are skipped; the original raised
                # NameError on them.
                uid_d[uid]['html'].append(i)

    # ---- 2. Read the sample text file ---------------------------------------
    # NOTE(review): str_ is reassigned inside the loop below before it is ever
    # read, so this step appears to matter only in that it requires the file
    # to exist -- confirm whether it is still needed.
    f_p = 'mybaidu.parp.b.txt'
    with open(f_p, 'r', encoding='utf-8') as fr:
        str_ = ''.join(i.replace('\n', '') for i in fr)

    # ---- 3. Parse each record's HTML; keep only records with images --------
    r_d = {}
    fhtml = 'qqzong.vedio.allinone.tmp.html'
    for uid in uid_d:
        str_ = ''.join(uid_d[uid]['html'])
        # The HTML is round-tripped through a scratch file, as in the
        # original; the file is left on disk afterwards.
        with open(fhtml, 'w', encoding='utf-8') as fw:
            fw.write(str_)
        with open(fhtml, 'r', encoding='utf-8') as fo:
            soup = BeautifulSoup(fo, 'html.parser')

        img_l = soup.find_all('img')
        if not img_l:
            continue

        # <img> tags without a src attribute are skipped; the original raised
        # KeyError on them.
        uid_d[uid]['img'] = [tag['src'] for tag in img_l if tag.has_attr('src')]
        uid_d[uid]['txt'] = soup.text
        r_d[uid] = uid_d[uid]

        # Extra images to append to every record (currently none). Earlier:
        # incr_l = ['http://www.51g3.net/templates/images/logo.jpg',
        #           'http://www.51g3.net/attached/image/20171206104541_20247.jpg',
        #           'http://www.51g3.net/attached/image/20171129183441_78749.png',
        #           'http://www.51g3.net/templates/images/agentimg.jpg']
        incr_l = []
        r_d[uid]['img'] += incr_l

        sentence_l = paragraph_to_sentence(uid_d[uid]['txt'], [])

        # Speech synthesis and image download were disabled in the original
        # and stay disabled; uncomment to re-enable.
        str_ = uid_d[uid]['txt']
        # gen_bd_mp3(uid, str_)
        # for img_url in r_d[uid]['img']:
        #     spider_webimg_dl_return_local_img_path(img_dir, img_url, uid, local_default='default.DONOT_REMOVE.png')

        r_d[uid]['sentence_l'] = Cut(cutlist, uid_d[uid]['txt'])

    uid_l = list(r_d)

    # ---- 4. Match previously downloaded assets to records -------------------
    os_sep = os.sep
    this_file_abspath = os.path.abspath(__file__)
    this_file_dirname = os.path.dirname(this_file_abspath)
    this_file_name = os.path.basename(this_file_abspath)

    # NOTE(review): assets are looked up next to this script, whereas img_dir
    # and gen_bd_mp3's mp3_dir are absolute paths -- confirm they point at the
    # same folders.
    f_img_d = os.path.join(this_file_dirname, 'mypng', '*.png')
    f_mp3_d = os.path.join(this_file_dirname, 'mymp3', '*.mp3')
    imgs, img_size_d = glob.glob(f_img_d), {}
    mp3s, mp3_size_d = glob.glob(f_mp3_d), {}

    for uid in r_d:
        # File names embed 'g3db<uid>g3uid', which ties an asset to its record.
        chk_str = '{}{}{}'.format('g3db', uid, 'g3uid')
        r_d[uid]['img_path'] = [img for img in imgs if chk_str in img]
        r_d[uid]['img_n'] = len(r_d[uid]['img_path'])
        for mp3 in mp3s:
            if chk_str in mp3:
                r_d[uid]['mp3_path'] = mp3

    print('-----------------')

    # Notes kept from the original; presumably the uids printed by the loop
    # below for each threshold on img_n -- TODO confirm:
    #   >2  15796 16010 16065 16577
    #   >1  15796 16010 16065 16577 16635 17923
    #   >=1 15706 15766 15791 15796 16010 16065 16159 16509 16577 16635 16895
    #       16915 16919 17206 17240 17622 17642 17923 18112 18207 18237 18239
    #       18438 18701 18909 18934 18935 18937 18996 19135 19323 19589 19590
    #       19592
    uid_r_l = []  # NOTE(review): never filled in the original either
    for uid in r_d:
        if r_d[uid]['img_n'] >= 1:
            print(uid)

    dddd = 9  # NOTE(review): looks like a debugger breakpoint anchor