A Simple Web Crawler in Python

  • Setting up the basic environment
  • pip install requests, pip install pandas, pip install bs4 (plus pip install lxml for the parser used below); to inspect the request headers your client sends, use http://httpbin.org/get (see the sketch after this list)
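A quick way to check your headers is to request http://httpbin.org/get, which echoes the request back as JSON (a minimal sketch, assuming network access):

import requests

# httpbin.org/get echoes the request back, including the headers it received
resp = requests.get("http://httpbin.org/get")
print(resp.json()["headers"])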
  1. Fetching the data
import requests

URL = "http://book.douban.com/latest"

# Fetch the page
def get_data():
    url = URL
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15"}
    data = requests.get(url, headers=headers)
    return data
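get_data() returns the full requests.Response object, so a quick sanity check before parsing might look like this (a small sketch, not part of the original script):

resp = get_data()
print(resp.status_code)                  # expect 200 on success
print(resp.headers.get('Content-Type'))  # should be an HTML content type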
  2. Parsing the data
from bs4 import BeautifulSoup

# Parse the page
def parse_data(data):
    soup = BeautifulSoup(data.text, 'lxml')
    # The book list is laid out as two <ul> columns; collect the <li> items from both
    books_left = soup.find('ul', {'class': 'cover-col-4 clearfix'})
    books_left = books_left.find_all('li')
    books_right = soup.find('ul', {'class': 'cover-col-4 pl20 clearfix'})
    books_right = books_right.find_all('li')
    books = list(books_left) + list(books_right)

    img_urls = []
    titles = []
    ratings = []
    authors = []
    details = []
    for book in books:
        # Cover image URL and title
        img_url = book.find_all('a')[0].find('img').get('src')
        img_urls.append(img_url)
        title = book.find_all('a')[1].get_text()
        titles.append(title)

        # Rating
        rating = book.find('p', {'class': 'rating'}).get_text()
        rating = rating.replace('\n', '').replace(' ', '')
        ratings.append(rating)

        # Author
        author = book.find('p', {'class': 'color-gray'}).get_text()
        author = author.replace('\n', '').replace(' ', '')
        authors.append(author)

        # Short description
        detail = book.find_all('p')[2].get_text()
        detail = detail.replace('\n', '').replace(' ', '')
        details.append(detail)


    print('img_urls:',img_urls)

    print('titles:',titles)

    print('ratings:',ratings)

    print('authors',authors)

    print('details:',details)

    return img_urls,titles,ratings,authors,details
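parse_data() assumes every field is present; if Douban changes its markup, find() returns None and the following get_text() raises AttributeError. A defensive helper could guard against that (safe_text is a hypothetical name, not from the original script):

from bs4 import BeautifulSoup

def safe_text(tag):
    # find() returns None when an element is missing; fall back to an empty string
    return tag.get_text(strip=True) if tag is not None else ''

li = BeautifulSoup('<li><p class="rating">8.4</p></li>', 'html.parser').find('li')
print(safe_text(li.find('p', {'class': 'rating'})))      # -> 8.4
print(safe_text(li.find('p', {'class': 'color-gray'})))  # -> '' (missing element)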
  3. Saving the data
import pandas as pd

# Save the results to a CSV file
def save_data(imgurls, titles, ratings, authors, details):
    result = pd.DataFrame()
    result['img_urls'] = imgurls
    result['titles'] = titles
    result['ratings'] = ratings
    result['authors'] = authors
    result['details'] = details
    result.to_csv('result.csv', index=False)
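A minimal driver tying the three steps together (assuming the functions above live in one script):

if __name__ == '__main__':
    data = get_data()
    img_urls, titles, ratings, authors, details = parse_data(data)
    save_data(img_urls, titles, ratings, authors, details)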
