去哪儿爬虫
```python
# -*- coding: utf-8 -*-
# @ModuleName: 去哪儿爬虫
# @Function:
# @Author: xiaochen
# @Time: 2021/2/18 11:07
import requests
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class Qner(object):
    """Scrape Qunar ticket-search result pages for a single keyword.

    Typical use is ``get_pagemax()`` followed by ``get_urllist()``. The
    latter prints each listed sight's name, address and price, and collects
    the detail-page links in ``self.hrefs``.
    """

    # Seconds to wait for each HTTP response. Without a timeout,
    # ``requests.get`` can block indefinitely on a stalled connection.
    timeout = 10

    def __init__(self, city=None):
        """Prepare request headers and the search keyword.

        Args:
            city: Keyword appended to the search URL. When omitted, the
                module-level name ``city`` is used, which is how this class
                originally obtained its keyword.

        Raises:
            NameError: If ``city`` is omitted and the module defines no
                global ``city``.
        """
        if city is None:
            # Backward compatibility: existing callers set a module-level
            # ``city`` and then call ``Qner()`` with no arguments.
            try:
                city = globals()['city']
            except KeyError:
                raise NameError("name 'city' is not defined") from None
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random}
        self.url = 'https://piao.qunar.com/ticket/list.htm?keyword='
        self.city = city
        self.pagemax = 0
        self.hrefs = []

    def _list_url(self, page=None):
        """Return the search URL for ``self.city``, optionally for one page."""
        url = f'{self.url}{self.city}'
        if page is not None:
            url = f'{url}&page={page}'
        return url

    def get_pagemax(self):
        """Fetch the first result page and store the page count.

        The count is read from the second-to-last link inside the
        ``div.pager`` element (presumably the last link is a "next page"
        control -- confirm against the live markup). If the response is not
        HTTP 200, ``self.pagemax`` is left unchanged. If the pager is
        missing or its label is not a number, ``self.pagemax`` is set to 1
        so that the first page is still scanned.
        """
        res = requests.get(
            self._list_url(), headers=self.headers, timeout=self.timeout
        )
        if res.status_code != 200:
            return
        soup = BeautifulSoup(res.text, 'html.parser')
        pager = soup.find('div', class_='pager')
        links = pager.find_all('a') if pager is not None else []
        try:
            self.pagemax = int(links[-2].get_text())
        except (IndexError, ValueError):
            self.pagemax = 1

    def get_urllist(self):
        """Walk pages 1..``self.pagemax`` and collect sight detail links.

        For every listed sight the name, address and price are printed and
        the absolute detail URL is appended to ``self.hrefs``. Pages that do
        not answer with HTTP 200 are skipped.
        """
        for page in range(1, self.pagemax + 1):
            # Use the instance's keyword, not whatever global ``city``
            # happens to exist when this method runs.
            res = requests.get(
                self._list_url(page), headers=self.headers, timeout=self.timeout
            )
            time.sleep(2)
            if res.status_code == 200:
                soup = BeautifulSoup(res.text, 'html.parser')
                items = soup.find_all('div', class_="sight_item_detail clrfix")
                for item in items:
                    name = item.find('a', class_='name').get_text()
                    print(name)
                    address = (
                        item.find('p', class_="address color999")
                        .find('span')
                        .get_text()
                    )
                    print(address)
                    try:
                        price = (
                            item.find('span', class_="sight_item_price")
                            .find('em')
                            .get_text()
                        )
                        print(price)
                    except AttributeError:
                        # ``find`` returned None: this item shows no price.
                        print("价格不详!")
                    href = item.find('h3', class_='sight_item_caption').find('a')['href']
                    self.hrefs.append(f'https://piao.qunar.com{href}')
            # Pause between pages to avoid hammering the site.
            time.sleep(5)
if __name__ == '__main__':
    # ``city`` has to be a module-level name: Qner() picks it up as the
    # search keyword when it is constructed without arguments.
    city = "北京"
    crawler = Qner()
    # The page count must be known before the result pages are walked.
    crawler.get_pagemax()
    crawler.get_urllist()
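# Note: this script uses BeautifulSoup's built-in "html.parser"; the "lxml" parser used in older examples only works when the separate lxml package is installed.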
```
页:
[1]