def parse_page(content):
    """Extract movie records from one HTML page.

    The markup in ``content`` is parsed with ``etree.HTML`` and six
    parallel lists are collected through XPath queries.

    Args:
        content: HTML source of the page to parse.

    Returns:
        A ``zip`` iterator yielding one tuple per movie:
        ``(ranking, movie_name, performer, releasetime, score, movie_img)``.
        Because ``zip`` stops at the shortest input, the number of rows
        equals the length of the shortest extracted list.
    """
    tree = etree.HTML(content)

    # Movie ranking
    ranking = tree.xpath("//dd/i/text()")
    # Movie name
    movie_name = tree.xpath('//p[@class="name"]/a/text()')
    # Leading actors; the raw text nodes carry surrounding whitespace.
    performer = [
        text.strip() for text in tree.xpath("//p[@class='star']/text()")
    ]
    # Release time
    releasetime = tree.xpath('//p[@class="releasetime"]/text()')
    # Score: the page splits it into an "integer" and a "fraction" node,
    # which are concatenated pairwise (zip stops at the shorter list).
    integer_parts = tree.xpath('//p[@class="score"]/i[@class="integer"]/text()')
    fraction_parts = tree.xpath('//p[@class="score"]/i[@class="fraction"]/text()')
    score = [whole + frac for whole, frac in zip(integer_parts, fraction_parts)]
    # Movie cover image URL, read from the data-src attribute.
    movie_img = tree.xpath('//img[@class="board-img"]/@data-src')

    return zip(ranking, movie_name, performer, releasetime, score, movie_img)
def save_results(result):
    """Append one record as a single row to ``maoyan.csv``.

    Args:
        result: Iterable of field values making up one CSV row.
    """
    # newline='' is required by the csv module: the writer emits its own
    # line terminators, and without it platforms that translate '\n' end up
    # with an extra '\r' (blank lines between rows).
    # NOTE(review): the file uses the platform default encoding, as before;
    # consider an explicit encoding if the output must be portable.
    with open('maoyan.csv', 'a', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(result)
if __name__ == '__main__':
    # Script entry point: walk the offsets 0, 10, ..., 90, fetch each page
    # through index_page, parse it and append every parsed row to the CSV.
    print('开始爬取数据...')
    for offset in range(0, 100, 10):
        page_html = index_page(offset)
        for row in parse_page(page_html):
            save_results(row)
    print('数据爬取完毕!')