數據庫html 數據的分句

時間 2019-11-18

標籤數據庫 html 數據分句欄目 SQL 简体版

原文原文鏈接

Python 中文分句 - CSDN博客 https://blog.csdn.net/laoyaotask/article/details/9260263

# Sentence-delimiter characters; adjust the list to suit the text being processed.
# Legacy variant that called .decode('utf-8') on the literal, kept for reference:
#cutlist = "。！？".decode('utf-8')

# NOTE: this first list is overwritten by the assignment on the next line, so it has no effect.
cutlist = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、','：']
# Effective delimiter list. Cut() is called with list(lines), i.e. one character
# per element, so the multi-character entries '...' and '、、、' can never match.
cutlist = [ '。', '；', '？', '.', ';', '?', '...', '、、、','：',':','，']


# Tell whether a character is one of the sentence delimiters.
def FindToken(cutlist, char):
    """Return True when ``char`` is an element of ``cutlist``, otherwise False."""
    return char in cutlist


# Core sentence-splitting routine.
def Cut(cutlist, lines):
    """Split ``lines`` into sentences, closing each one at a delimiter.

    Args:
        cutlist: Collection of delimiter items. Membership is tested once per
            element of ``lines``.
        lines: Iterable of characters (a string, or a list of one-character
            strings) to split.

    Returns:
        A list of sentences in input order. Every sentence keeps the delimiter
        that closed it. Text that follows the last delimiter is returned as a
        final item instead of being silently dropped.
    """
    sentences = []  # completed sentences, returned to the caller
    current = []  # characters of the sentence currently being assembled

    for ch in lines:
        current.append(ch)
        if ch in cutlist:  # same membership test FindToken() performs
            sentences.append(''.join(current))
            current = []

    # Keep the trailing fragment that has no closing delimiter.
    if current:
        sentences.append(''.join(current))
    return sentences

r_s=[]
# Read tmp.txt line by line, split each line with Cut(), and collect every
# sentence that is non-empty after stripping whitespace into r_s.
with open('tmp.txt','r',encoding='utf-8') as fr :
    for lines in fr:
        l = Cut(list(cutlist), list(lines))
        for line in l:
            if line.strip() != "":
                line=line.strip()
                r_s.append(line)

                # Disabled variant: additionally split each sentence on whitespace.
                # li = line.strip().split()
                # for sentence in li:
                #     r_s.append(sentence)




# NOTE(review): dd is not read anywhere in the visible file; presumably a debugger breakpoint anchor.
dd=9

　雷鋒網AI科技評論按：去年7月20日，國務院正式印發《新一代人工智能發展規劃》的通知，《規劃》中指出，接下來人工智能將成爲國家重要發展戰略之一，其意義影響到國家的國際競爭力、經濟發展、社會建設等等大方向。
　　爲了落實《新一代人工智能發展規劃》，人才培養是關鍵。教育部在近日正式發佈了《高等學校人工智能創新行動計劃》。
　　教育部在《高等學校人工智能創新行動計劃》中強調，要增強人工智能領域專業建設，推動「新工科」建設，形成「人工智能+X」複合專業培養新模式。到 2020 年建設 100 個「人工智能+X」複合特色專業，推進重要方向的教材和在線開放課程建設。到 2020 年編寫 50 本具備國際一流水平的本科生和研究生教材、建設 50 門人工智能領域國家級精品在線開放課程、創建 50 家人工智能學院、研究院或交叉研究中心，並引導高校通過增量支持和存量調整，加大人工智能領域人才培養力度。在職業院校大數據、信息管理相關專業中增加人工智能相關內容，培養人工智能應用領域技術技能人才。
　　此外，教育部還列出了三個階段性目標：
到 2020 年，基本完成適應新一代人工智能發展的高校科技創新體系和學科體系的優化佈局，高校在新一代人工智能基礎理論和關鍵技術研究等方面取得新突破，人才培養和科學研究的優點進一步提高，並推進人工智能技術普遍應用。
到 2025 年，高校在新一代人工智能領域科技創新能力和人才培養質量顯著提高，取得一批具備國際重要影響的原創成果，部分理論研究、創新技術與應用示範達到世界領先水平，有效支撐我國產業升級、經濟轉型和智能社會建設。
到 2030 年，高校成爲建設世界主要人工智能創新中心的核心力量和引領新一代人工智能發展的人才高地，爲我國躋身創新型國家前列提供科技支撐和人才保障。
　　以下是《高等學校人工智能創新行動計劃》全文：


帶有兩個文本字段和一個提交按鈕的 HTML 表單：
<form action="form_action.asp" method="get">
  <p>Name: <input type="text" name="fullname" /></p>
  <p>Email: <input type="text" name="email" /></p>
  <input type="submit" value="Submit" />
</form>
親自試一試
定義和用法
name 屬性規定 input 元素的名稱。
name 屬性用於對提交到服務器後的表單數據進行標識，或者在客戶端通過 JavaScript 引用表單數據。
註釋：只有設置了 name 屬性的表單元素才能在提交表單時傳遞它們的值。

# Sentence-delimiter characters; adjust the list to suit the text being processed.
# Legacy variant that called .decode('utf-8') on the literal, kept for reference:
#cutlist = "。！？".decode('utf-8')

# NOTE: this first list is overwritten by the assignment on the next line, so it has no effect.
cutlist = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、','：']
# Effective delimiter list. Cut() is called with list(lines), i.e. one character
# per element, so the multi-character entries '...' and '、、、' can never match.
cutlist = [ '。', '；', '？', '.', ';', '?', '...', '、、、','：',':','，']


# Tell whether a character is one of the sentence delimiters.
def FindToken(cutlist, char):
    """Return True when ``char`` is an element of ``cutlist``, otherwise False."""
    return char in cutlist


# Core sentence-splitting routine.
def Cut(cutlist, lines):
    """Split ``lines`` into sentences, closing each one at a delimiter.

    Args:
        cutlist: Collection of delimiter items. Membership is tested once per
            element of ``lines``.
        lines: Iterable of characters (a string, or a list of one-character
            strings) to split.

    Returns:
        A list of sentences in input order. Every sentence keeps the delimiter
        that closed it. Text that follows the last delimiter is returned as a
        final item instead of being silently dropped.
    """
    sentences = []  # completed sentences, returned to the caller
    current = []  # characters of the sentence currently being assembled

    for ch in lines:
        current.append(ch)
        if ch in cutlist:  # same membership test FindToken() performs
            sentences.append(''.join(current))
            current = []

    # Keep the trailing fragment that has no closing delimiter.
    if current:
        sentences.append(''.join(current))
    return sentences

r_s=[]
# Read tmp.txt line by line, split each line with Cut(), and collect every
# sentence that is non-empty after stripping whitespace into r_s.
with open('tmp.txt','r',encoding='utf-8') as fr :
    for lines in fr:
        l = Cut(list(cutlist), list(lines))
        for line in l:
            if line.strip() != "":
                line=line.strip()
                r_s.append(line)

                # Disabled variant: additionally split each sentence on whitespace.
                # li = line.strip().split()
                # for sentence in li:
                #     r_s.append(sentence)
str_=''

# Disabled alternative delimiter list that also treats '\n' as a delimiter.
# cutlist = [ '。', '；', '？', '.', ';', '?', '...', '、、、','：',':','，','\n']

# Second pass over tmp.txt: join all lines into one string with newlines removed.
with open('tmp.txt','r',encoding='utf-8') as fr :
    for lines in fr:
        str_='{}{}'.format(str_,lines.replace('\n',''))
        # Disabled: per-line sentence splitting, same as the loop above.
        # l = Cut(list(cutlist), list(lines))
        # for line in l:
        #     if line.strip() != "":
        #         line=line.strip()
        #         r_s.append(line)


# NOTE(review): dd is not read anywhere in the visible file; presumably a debugger breakpoint anchor.
dd=9

數據庫html 數據的分句 python

'''
SELECT * FROM Info_Roles WHERE Flag=1 LIMIT 2;


 select   top   y   *   from   表   where   主鍵   not   in(select   top   (x-1)*y   主鍵   from   表)



  若是表中無主鍵,能夠用臨時表,加標識字段解決.這裏的x,y能夠用變量.

  select   id=identity(int,1,1),*     into   #tb   from   表
  select   *   from   #tb   where   id   between   (x-1)*y   and   x*y-1




 select   top   1000   Info_ID   from   Info_Roles
 select   top   2000   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where   Info_ID   not   in( select   top   1000   Info_ID   from   Info_Roles   )   ;
 select   top   399   Info_ID,',xiaole20180410SPLIT,',UPPER(content)   from   Info_Content      ;
 select   top   399   CHARINDEX('IMG',UPPER(content))   from   Info_Content      ;
 select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where  CHARINDEX('IMG',UPPER(content))>0;
 select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where
 Info_ID      in( select   top   1000   Info_ID   from   Info_Roles   )  and
  CHARINDEX('IMG',UPPER(content))>0
 ;



SELECT
	TOP 15 Info_ID,
	',xiaole20180410SPLIT,',
	content
FROM
	Info_Content
WHERE
	Info_ID IN (
		SELECT
			TOP 1000 Info_ID
		FROM
			Info_Roles
		WHERE
			Flag = 1
	)
AND CHARINDEX('IMG', UPPER(content)) > 0;





SELECT
	TOP 200 Info_ID,
	',xiaole20180410SPLIT,',
	content
FROM
	Info_Content
WHERE
	Info_ID IN (
		SELECT
			TOP 90000 Info_ID
		FROM
			Info_Roles
	)
AND CHARINDEX('<IMG', UPPER(content)) > 0;



'''

from bs4 import BeautifulSoup
from selenium import webdriver

# Marker that separates the id column from the content column in the input file;
# the same literal appears in the SELECT statements of the SQL notes string above.
xlsplit_str = ',xiaole20180410SPLIT,'
# uid_d maps each uid to {'html': [text fragments]}.
f_db_txt, uid_d = 'db.uid.para.txt', {}
with open(f_db_txt, 'r', encoding='utf-8') as fr:
    for i in fr:
        # Drop tabs and newlines from the raw line.
        i = i.replace('\t', '').replace('\n', '')
        if xlsplit_str in i:
            # A line holding the marker starts a new record: the part before the
            # marker (spaces removed) is the uid, the part after it is the first fragment.
            l = i.split(xlsplit_str)
            uid = l[0].replace(' ', '')
            uid_d[uid] = {}
            uid_d[uid]['html'] = []
            # Only l[1] is kept; any text after a second marker on the same line is ignored.
            uid_d[uid]['html'].append(l[1])
        else:
            # A line without the marker continues the most recently started record.
            # NOTE(review): if such a line comes before the first marker line,
            # `uid` is not yet assigned by this loop and this statement fails.
            uid_d[uid]['html'].append(i)

# Filled further down: the subset of uid_d whose HTML contains at least one <img>.
r_d = {}


'''
中文分句
'''
# Delimiter list for Cut(). Cut() is called with list(...) of the text, i.e. one
# character per element, so the multi-character entries '...' and '、、、' can never match.
cutlist = [ '。', '；', '？', '.', ';', '?', '...', '、、、','：',':','，',',']


# Tell whether a character is one of the sentence delimiters.
def FindToken(cutlist, char):
    """Return True when ``char`` is an element of ``cutlist``, otherwise False."""
    return char in cutlist


# Core sentence-splitting routine.
def Cut(cutlist, lines):
    """Split ``lines`` into sentences, closing each one at a delimiter.

    Args:
        cutlist: Collection of delimiter items. Membership is tested once per
            element of ``lines``.
        lines: Iterable of characters (a string, or a list of one-character
            strings) to split.

    Returns:
        A list of sentences in input order. Every sentence keeps the delimiter
        that closed it. Text that follows the last delimiter is returned as a
        final item instead of being silently dropped.
    """
    sentences = []  # completed sentences, returned to the caller
    current = []  # characters of the sentence currently being assembled

    for ch in lines:
        current.append(ch)
        if ch in cutlist:  # same membership test FindToken() performs
            sentences.append(''.join(current))
            current = []

    # Keep the trailing fragment that has no closing delimiter.
    if current:
        sentences.append(''.join(current))
    return sentences






'''

'''
def paragraph_to_sentence(paragraph, sentence_l):
    """Split ``paragraph`` into sentence fragments and append them to ``sentence_l``.

    ASCII spaces are removed from ``paragraph`` first. The text is then split
    on every separator in ``sentence_split_l``; the separators themselves are
    discarded and empty fragments are skipped.

    The previous implementation split only when a newline was present, kept
    just the first two pieces of each split, and re-appended overlapping text
    for later separators.

    Args:
        paragraph: Text to split.
        sentence_l: List that receives the fragments; it is extended in place.

    Returns:
        The same ``sentence_l`` object, with the fragments appended in order.
    """
    import re  # local import keeps the function self-contained

    sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']
    # Longest separators first so '...' and '、、、' are matched as single units.
    ordered = sorted(sentence_split_l, key=len, reverse=True)
    pattern = '|'.join(re.escape(sep) for sep in ordered)

    text = paragraph.replace(' ', '')
    sentence_l.extend(piece for piece in re.split(pattern, text) if piece)
    return sentence_l


def paragraph_to_sentence_no_recursion(paragraph, sentence_l):
    """Iteratively split ``paragraph`` into fragments and append them to ``sentence_l``.

    ASCII spaces are removed from ``paragraph`` first. The text is then split
    on each separator in turn; separators are discarded and empty fragments
    are skipped.

    The previous body was a copy of the recursive version and still called
    ``paragraph_to_sentence``, so it was neither non-recursive nor correct.
    This version uses no recursion and no other helper.

    Args:
        paragraph: Text to split.
        sentence_l: List that receives the fragments; it is extended in place.

    Returns:
        The same ``sentence_l`` object, with the fragments appended in order.
    """
    sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']

    fragments = [paragraph.replace(' ', '')]
    # Longest separators first so '...' and '、、、' are consumed as whole units.
    for sep in sorted(sentence_split_l, key=len, reverse=True):
        fragments = [piece for frag in fragments for piece in frag.split(sep)]

    sentence_l.extend(frag for frag in fragments if frag)
    return sentence_l


# Module-level copy of the paragraph_to_sentence() body, run on an empty paragraph.
# With paragraph == '' the first split returns [''], so '' is appended to
# sentence_l and the loop stops at the first separator.
paragraph=''
sentence_l=[]
paragraph = paragraph.replace(' ', '')
sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']
for i in sentence_split_l:
    ll = paragraph.split(i)
    sentence_l.append(ll[0])
    if len(ll) > 1:
        # Only the second piece is processed further; pieces after it are not used.
        paragraph_to_sentence(ll[1], sentence_l)
    else:
        break


def sentence_l_to_sentence_l_l(sentence_l):
    """Split every string in ``sentence_l`` on all separators and flatten the result.

    The previous implementation appended the whole sentence once for every
    separator it did not contain, so the output was full of duplicates, and
    it never split on more than one separator at a time.

    Args:
        sentence_l: Iterable of strings to split further.

    Returns:
        A new flat list holding the non-empty fragments of every input string,
        in input order, with the separators removed.
    """
    sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']
    # Longest separators first so '...' and '、、、' are consumed as whole units.
    separators = sorted(sentence_split_l, key=len, reverse=True)

    sentence_l_l = []
    for sentence in sentence_l:
        fragments = [sentence]
        for sep in separators:
            fragments = [piece for frag in fragments for piece in frag.split(sep)]
        sentence_l_l.extend(frag for frag in fragments if frag)
    return sentence_l_l


import requests, time, threading

# Directory for downloaded images (value ends with a path separator).
# NOTE: this first value is overwritten by the assignment on the next line, so it has no effect.
img_dir = 'C:\\Users\\sas\\PycharmProjects\\py_win_to_unix\\crontab_chk_url\\personas\\trunk\\plugins\\spider\\dl_img_tmp\\'
img_dir = 'C:\\Users\\sas\\PycharmProjects\\produce_video\\mypng\\'


# Sample image URL:
# http://www.lky365.com/editor/uploadfile/20090508144220411.jpg

# The image directory above, written without escaping:
# C:\Users\sas\PycharmProjects\produce_video\mypng

def spider_webimg_dl_return_local_img_path(img_dir, img_url, uid, local_default='default.DONOT_REMOVE.png'):
    """Download ``img_url`` into ``img_dir`` and return the local file path.

    The file is named ``g3db<uid>g3uid<stem>.png``, where ``<stem>`` is the
    last path component of the URL without its extension.

    Fixes over the previous version:
      * the image is fetched once instead of twice;
      * the public ``Response.content`` is used instead of the private ``_content``;
      * the stem comes from the URL path; previously it was the text before the
        first '.', so ``http://www.example.com/a.jpg`` produced ``www``;
      * the default path is returned whenever no file was written.

    Args:
        img_dir: Target directory, including the trailing path separator.
        img_url: URL of the image to download.
        uid: Record id embedded in the file name.
        local_default: File name (inside ``img_dir``) returned on failure.

    Returns:
        Path of the written file, or ``img_dir + local_default`` if the
        download or the write failed.
    """
    import posixpath
    from urllib.parse import urlsplit

    fallback = '%s%s' % (img_dir, local_default)
    try:
        response = requests.get(img_url)
        if response.status_code != 200 or not response.content:
            print(img_url)
            return fallback

        # The URL path always uses '/', so posixpath is correct on every platform.
        stem = posixpath.splitext(posixpath.basename(urlsplit(img_url).path))[0]
        target = '{}{}{}{}{}{}'.format(img_dir, 'g3db', uid, 'g3uid', stem, '.png')
        with open(target, 'wb') as f:
            f.write(response.content)
        return target
    except Exception as e:
        # Best effort: report the failing URL and uid, then fall back to the default image.
        print(img_url, ',,,', uid)
        print(e)
    return fallback


from aip import AipSpeech

# App id, API key and secret key passed to AipSpeech() in gen_bd_mp3().
# NOTE(review): credentials are hard-coded in the source; consider loading them
# from the environment or an untracked config file and rotating these values.
bd_k_l = ['11059852', '5Kk01GtG2fjCwpzEkwdn0mjw', 'bp6Wyx377Elq7RsCQZzTBgGUFzLm8G2A']
APP_ID, API_KEY, SECRET_KEY = bd_k_l

# Join every line of the text file, with newlines removed, into str_.
f_p, str_ = 'mybaidu.parp.b.txt', ''
with open(f_p, 'r', encoding='utf-8') as fr:
    for i in fr:
        ii = i.replace('\n', '')
        str_ = '{}{}'.format(str_, ii)


def gen_bd_mp3(uid, str_):
    """Synthesize ``str_`` to speech with AipSpeech and save it as an MP3 file.

    The audio is written to ``<mp3_dir>g3db<uid>g3uid.mp3``. A failed
    synthesis is now reported on stdout instead of being silently ignored.

    Args:
        uid: Record id embedded in the output file name.
        str_: Text to synthesize.

    Returns:
        None.
    """
    mp3_dir = 'C:\\Users\\sas\\PycharmProjects\\produce_video\\mymp3\\'
    client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
    result = client.synthesis(str_, 'zh', 1, {
        'vol': 5,
    })
    # Per the original note: success yields the binary audio, failure yields a
    # dict carrying the error code.
    if isinstance(result, dict):
        print(uid, ',,,', result)
        return

    f_w = '{}{}{}{}{}'.format(mp3_dir, 'g3db', uid, 'g3uid', '.mp3')
    with open(f_w, 'wb') as f:
        f.write(result)


# For every record: rebuild its HTML, parse it, and for records containing at
# least one <img> store the image URLs, the plain text and the split sentences.
for uid in uid_d:
    str_ = ''.join(uid_d[uid]['html'])
    # The HTML is round-tripped through a temporary file that is overwritten per record.
    fhtml = 'qqzong.vedio.allinone.tmp.html'
    with open(fhtml, 'w', encoding='utf-8') as fw:
        fw.write(str_)
    with open(fhtml, 'r', encoding='utf-8') as fo:
        soup = BeautifulSoup(fo, 'html.parser')
        img_l = soup.find_all('img')
        if len(img_l) > 0:
            l = soup.find_all('img')
            # NOTE(review): i.attrs['src'] presumably raises KeyError for an <img> without src — confirm.
            uid_d[uid]['img'], uid_d[uid]['txt'] = [i.attrs['src'] for i in l], soup.text
            r_d[uid] = {}
            # r_d[uid] now refers to the same dict object as uid_d[uid], so the
            # writes to r_d[uid] below are visible through uid_d[uid] as well.
            r_d[uid] = uid_d[uid]
            # Disabled: extra image URLs that used to be appended to every record.
            # incr_l = ['http://www.51g3.net/templates/images/logo.jpg',
            #           'http://www.51g3.net/attached/image/20171206104541_20247.jpg',
            #           'http://www.51g3.net/attached/image/20171129183441_78749.png',
            #           'http://www.51g3.net/templates/images/agentimg.jpg']
            incr_l = []
            r_d[uid]['img'] += incr_l
            # Disabled: store the paragraph_to_sentence() result directly.
            #            r_d[uid]['sentence_l']=paragraph_to_sentence(uid_d[uid]['txt'],[])

            # sentence_l is only consumed by the disabled sentence_l_to_sentence_l_l() line below.
            sentence_l = paragraph_to_sentence(uid_d[uid]['txt'], [])

            try:
                str_ = uid_d[uid]['txt']
                # Disabled: text-to-speech generation for this record.
                #  gen_bd_mp3(uid, str_)
            except Exception as e:
                print(e)
            for img_url in r_d[uid]['img']:
                # Disabled: image download for this record.
                #  spider_webimg_dl_return_local_img_path(img_dir, img_url, uid, local_default='default.DONOT_REMOVE.png')
                pass
            # Disabled: earlier sentence splitter.
            # r_d[uid]['sentence_l'] = sentence_l_to_sentence_l_l(sentence_l)
            # Sentences are produced by the character-based Cut() splitter.
            r_d[uid]['sentence_l'] =  Cut(list(cutlist), list(uid_d[uid]['txt']))
        else:
            # Records without any <img> are skipped.
            #  print(uid)
            pass

# All uids that have at least one image.
uid_l = [i for i in r_d]

import os

import os, time, glob
import cv2

# Locate this script on disk; the asset folders are resolved relative to its directory.
os_sep = os.sep
this_file_abspath = os.path.abspath(__file__)
this_file_dirname, this_file_name = os.path.dirname(this_file_abspath), os.path.abspath(__file__).split(os_sep)[
    -1]

# Glob patterns for <script dir>/mypng/*.png and <script dir>/mymp3/*.mp3.
f_img_d = '{}{}{}{}{}'.format(this_file_dirname, os_sep, 'mypng', os_sep, '*.png')
f_mp3_d = '{}{}{}{}{}'.format(this_file_dirname, os_sep, 'mymp3', os_sep, '*.mp3')
# img_size_d and mp3_size_d are created empty and not filled in the visible code.
imgs, img_size_d = glob.glob(f_img_d), {}
mp3s, mp3_size_d = glob.glob(f_mp3_d), {}

# Attach to every record the local files whose path contains 'g3db<uid>g3uid'.
for uid in  r_d:
    chk_str = '{}{}{}'.format('g3db', uid, 'g3uid')
    r_d[uid]['img_n'],r_d[uid]['img_path'] = 0,[]
    for img in imgs:
        if chk_str in img:
            r_d[uid]['img_n'] += 1
            r_d[uid]['img_path'].append(img)
        else:
            pass

    for mp3 in mp3s:
        if chk_str in mp3:
            # If several mp3 files match, the last one in glob order is kept.
            # 'mp3_path' is not set at all when no file matches.
            r_d[uid]['mp3_path']=mp3
        else:
            pass



print('-----------------')
'''
>2
15796
16010
16065
16577

>1
15796
16010
16065
16577
16635
17923

>=1
15706
15766
15791
15796
16010
16065
16159
16509
16577
16635
16895
16915
16919
17206
17240
17622
17642
17923
18112
18207
18237
18239
18438
18701
18909
18934
18935
18937
18996
19135
19323
19589
19590
19592


'''
# uid_r_l is created empty and not filled in the visible code.
uid_r_l=[]
# Print every uid that has at least one matching local image file.
for uid in r_d:
    if  int(r_d[uid]['img_n'])>=1:
        print(uid)



# NOTE(review): dddd is not read anywhere in the visible file; presumably a debugger breakpoint anchor.
dddd = 9