# Install Beautiful Soup together with the html5lib and lxml parsers that the
# examples in this file pass to BeautifulSoup(); the parsers are separate packages.
sudo pip3 install beautifulsoup4 html5lib lxml
from bs4 import BeautifulSoup
# Minimal document with two anchor tags, used to demonstrate tag access.
html_atag = """<html><body><p>Test html a tag example</p> <a href="http://www.allitebook.com">Home</a> <a href="http://www.allitebook.com/books">Books</a> </body> </html>"""
soup = BeautifulSoup(html_atag, "html5lib")
# Attribute-style access (soup.a) returns the first <a> tag in the document.
print(soup.a)
# input
html_markup = """<div> <ul id="students"> <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li> <li class="student"> <div class="name">Lucy</div> <div class="age">25</div> </li> </ul> </div>"""
# Parse this markup before searching it; otherwise `soup` would still refer to
# whatever document was parsed previously and find("ul") would not see the list.
soup = BeautifulSoup(html_markup, "lxml")
# find() returns the first matching tag (here the only <ul>).
student_entries = soup.find("ul")
print(student_entries)
# output
# <ul id="students"> <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li> <li class="student"> <div class="name">Lucy</div> <div class="age">25</div> </li> </ul>
找到ul節點後,通過觀察html可以得知,ul下有2個li,每一個li下有2個div,則通過student_entries.li可以獲取第一個li節點的數據,繼續通過student_entries.li.div可以獲取第一個li下第一個div的數據,例如:
# input
# Attribute access on a tag returns its first descendant tag with that name.
print(student_entries.li)
# output
# <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li>

# input
print(student_entries.li.div)
# output
# <div class="name">Carl</div>

# input
# .string is the tag's single text child; print() shows it without quotes.
print(student_entries.li.div.string)
# output
# Carl
# input
import re

email_id_example = """<div>The below HTML has the information that has email ids.</div> abc@example.com <div>xyz@example.com</div> <span>foo@example.com</span>"""
soup = BeautifulSoup(email_id_example, "lxml")
# Raw string so that \w and \. reach the regex engine unchanged instead of
# being treated as (invalid) Python string escapes.
emailid_regexp = re.compile(r"\w+@\w+\.\w+")
# `string=` matches against text nodes; it is the current name of the legacy
# `text=` keyword (Beautiful Soup 4.4.0+).
first_email_id = soup.find(string=emailid_regexp)
print(first_email_id)
# output
# abc@example.com

# input
# find_all() returns every matching text node instead of only the first.
all_email_id = soup.find_all(string=emailid_regexp)
print(all_email_id)
# output
# ['abc@example.com', 'xyz@example.com', 'foo@example.com']
# input
# Bind the first <li> of the student list so the examples below have a
# starting tag to navigate from.
first_student = student_entries.li
print(first_student)
# output
# <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li>

# input
# find_parent() walks upwards and returns the closest ancestor matching the name.
all_students = first_student.find_parent('ul')
print(all_students)
# output
# <ul id="students"> <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li> <li class="student"> <div class="name">Lucy</div> <div class="age">25</div> </li> </ul>

# input
# find_next_sibling() returns the next sibling that is a tag, skipping text nodes.
second_student = first_student.find_next_sibling()
print(second_student)
# output
# <li class="student"> <div class="name">Lucy</div> <div class="age">25</div> </li>
# input
print(first_student)
# output
# <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li>

# input
# Attribute access returns the first child <div> of the student entry.
name = first_student.div
print(name)
# output
# <div class="name">Carl</div>

# input
# .parent is the tag that directly contains this one.
print(name.parent)
# output
# <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li>

# input
# .next_sibling is the node immediately after this one. The markup has
# whitespace between the two <li> tags, so that node is a whitespace-only
# string rather than the second <li>.
print(repr(first_student.next_sibling))
# output
# ' '  (a whitespace-only string)

# input
# Step over the whitespace node to reach the following <li> tag.
print(first_student.next_sibling.next_sibling)
# output
# <li class="student"> <div class="name">Lucy</div> <div class="age">25</div> </li>
# Interactive-session style examples: a bare expression is followed by the
# value the interpreter echoes for it.

# input
first_student
# output
# <li class="student"> <div class="name">Carl</div> <div class="age">32</div> </li>

# input
first_student.name
# output
# 'li'

# input
# Assigning to .name renames the tag in place.
first_student.name = 'div'
first_student.name
# output
# 'div'

# input
first_student
# output
# <div class="student"> <div class="name">Carl</div> <div class="age">32</div> </div>

# input
# Tag attributes are read and written with dictionary-style indexing.
first_student['class'] = 'student_new'
print(first_student)
# output
# <div class="student_new"> <div class="name">Carl</div> <div class="age">32</div> </div>

# input
# `del` removes the attribute from the tag entirely.
del first_student['class']
print(first_student)
# output
# <div> <div class="name">Carl</div> <div class="age">32</div> </div>
# input
print(first_student.div.string)
# output
# Carl

# input
# Assigning to .string replaces the tag's text content.
first_student.div.string = 'carl_new'
print(first_student.div.string)
# output
# carl_new

# input
print(first_student)
# output
# <li class="student"> <div class="name">carl_new</div> <div class="age">32</div> </li>

# input
# decompose() removes the tag (and everything inside it) from the tree.
first_student.div.decompose()
print(first_student)
# output
# <li class="student"> <div class="age">32</div> </li>