# 1. Crawl the campus news
#
# Downloads the campus-news listing page, then follows the link of every
# listed item and prints its time, title, link and article body text.
import requests
from bs4 import BeautifulSoup

LIST_URL = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
# requests waits indefinitely by default; bound every request so a single
# stalled connection cannot hang the whole crawl.
TIMEOUT_SECONDS = 10

list_page = requests.get(LIST_URL, timeout=TIMEOUT_SECONDS)
# Fail loudly on an HTTP error instead of parsing an error page and
# silently printing nothing.
list_page.raise_for_status()
list_page.encoding = "utf-8"
soup = BeautifulSoup(list_page.text, "html.parser")

for news in soup.select("li"):
    title_nodes = news.select(".news-list-title")
    if not title_nodes:
        # This <li> has no .news-list-title element, so it is not treated
        # as a news entry.
        continue

    # First child of the .news-list-info element; presumably the
    # publication time (the page markup is not visible here -- confirm).
    published = news.select(".news-list-info")[0].contents[0].text
    title = title_nodes[0].text
    href = news.select("a")[0]["href"]

    # Fetch the article page the entry links to.
    detail_page = requests.get(href, timeout=TIMEOUT_SECONDS)
    detail_page.encoding = "utf-8"
    detail_soup = BeautifulSoup(detail_page.text, "html.parser")

    # An article page without a .show-content element used to raise
    # IndexError and abort the crawl; fall back to an empty body instead.
    body_nodes = detail_soup.select(".show-content")
    body = body_nodes[0].text if body_nodes else ""

    print(published, title, href, body)
# 2. Crawl a web page of personal interest
#
# Downloads the gamersky PC-game page and prints the text and link target of
# the first anchor found inside every <li> element.
import requests
from bs4 import BeautifulSoup

PAGE_URL = 'http://www.gamersky.com/pcgame/'
# requests waits indefinitely by default; bound the request so a stalled
# connection cannot hang the script.
TIMEOUT_SECONDS = 10

res = requests.get(PAGE_URL, timeout=TIMEOUT_SECONDS)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

for news in soup.select('li'):
    anchors = news.select('a')
    if not anchors:
        # <li> without any link: nothing to report.
        continue

    first_anchor = anchors[0]
    # Indexing ['href'] raised KeyError on anchors that carry no href
    # attribute and aborted the loop; skip such anchors instead.
    url = first_anchor.get('href')
    if url is None:
        continue

    print(first_anchor.text, url)