A follow-up to the previous post (scraping Twitter data), after the code I found on GitHub proved hard to follow. This attempt covers Selenium login, asynchronously loaded pages, and XPath extraction. Note that Twitter search only surfaces roughly the last week of posts, so the data collected here is still incomplete and the code needs further tuning.
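The write-up mentions a Selenium login step that the excerpt below does not show. A minimal sketch of what it could look like; the field names and button texts are assumptions about Twitter's login form and will likely need adjusting:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def twitter_login(username, password):
    # Hypothetical sketch: the locators below are assumptions about
    # Twitter's login page and may change at any time.
    wb = webdriver.Chrome()
    wb.get('https://twitter.com/login')
    time.sleep(3)  # crude wait for the async form to render
    wb.find_element(By.NAME, 'text').send_keys(username)          # assumed locator
    wb.find_element(By.XPATH, "//span[text()='Next']").click()    # assumed locator
    time.sleep(2)
    wb.find_element(By.NAME, 'password').send_keys(password)      # assumed locator
    wb.find_element(By.XPATH, "//span[text()='Log in']").click()  # assumed locator
    return wb

The scraping script itself: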
from selenium import webdriver
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd
import requests
import json
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import random
import logging
import urllib.error
from lxml import etree
from lxml import html
# Collect the text of every post on a search results page
def get_posts(url):
    """
    url: the search/browse page that lists all the posts
    """
wb = webdriver.Chrome()
wb.get(url)
time.sleep(3)
    # handle Twitter's asynchronous loading by scrolling until the page stops growing
    js = 'return document.body.scrollHeight'
height = wb.execute_script(js)
wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(5)
t1 = int(time.time())
status = True
num = 0
post_list = []
while status:
t2 = int(time.time())
        if t2 - t1 < 30:  # snapshot the source while scrolling: Twitter's virtualized feed drops earlier posts from the DOM, so one final read would be incomplete; the overlapping snapshots are deduplicated afterwards
            selector = etree.HTML(wb.page_source)  # parse the snapshot into an lxml HTML tree
infos = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
for info in infos:
post = info.xpath("string(.)").strip()
post_list.append(post)
new_height = wb.execute_script(js)
if new_height > height:
time.sleep(1)
wb.execute_script(
'window.scrollTo(0, document.body.scrollHeight)')
height = new_height
t1 = int(time.time())
elif num < 3:
time.sleep(3)
num = num + 1
        else:  # still no growth after the retries: we've reached the bottom of the page
status = False
    # overlapping snapshots repeat posts, so deduplicate while preserving order
    post_list = list(dict.fromkeys(post_list))
    return post_list
url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
post_list = get_posts(url)
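The fixed sleeps above work but are slow and brittle. A sketch of the same scroll-and-wait step using an explicit wait instead (assuming the same Chrome driver; scroll height is not a built-in expected condition, so a lambda does the polling):

from selenium.webdriver.support.ui import WebDriverWait

def scroll_and_wait(wb, old_height, timeout=10):
    # Scroll to the bottom, then block until scrollHeight actually grows;
    # raises TimeoutException once we are at the true bottom of the feed.
    wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    WebDriverWait(wb, timeout).until(
        lambda d: d.execute_script('return document.body.scrollHeight') > old_height)
    return wb.execute_script('return document.body.scrollHeight')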
Below is the full script this was extracted from, with the dead-end attempts left in as comments.

# -*- coding: utf-8 -*-
"""
Created on Mon Dec 13 22:24:58 2021
@author: 18742
"""
#grab the whole tweet block first, then use string(.) to pull its text
##extract the per-tweet fields from the page
#def get_info2(wb):
# wb.implicitly_wait(10)
## post = wb.find_element_by_xpath("//*/div[@class='css-1dbjc4n']/div/span").text
# post = wb.find_element_by_xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#
# data = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
# post_list = []
# for i in range(len(data)):
# post = data[i].xpath('string(.)').strip()  # merge the text nodes and strip surrounding whitespace
# print(post)
# post = str(post)
# post_list.append("".join(post))
#
# like = wb.find_element_by_xpath("//*/div[@data-testid='like']//div/span/span").text
# retweet = wb.find_element_by_xpath("//*/div[@data-testid='retweet']//div/span/span").text
# reply = wb.find_element_by_xpath("//*/div[@data-testid='reply']//div/span/span").text
# data = {
## "good":good,
# "post":post,
# "like":like,
# "retweet":retweet,
# "reply":reply}
# return data
#
#
#
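The string(.) trick is what makes this workable: it concatenates all descendant text of a node, so a tweet whose text is split across many spans comes back as one string. A standalone illustration:

from lxml import etree

frag = etree.HTML("<div class='post'><span>nuclear </span><span>waste water</span></div>")
block = frag.xpath("//div[@class='post']")[0]
print(block.xpath("string(.)").strip())  # -> nuclear waste water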
## Collect every post on the page (earlier attempt: scroll to the bottom first, scrape once at the end)
#def get_posts(url):
# """
# url: the search/browse page that lists all the posts
# """
# wb = webdriver.Chrome()
# wb.get(url)
# time.sleep(3)
#
#
# js = 'return action=document.body.scrollHeight'
# height = wb.execute_script(js)
# wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
# time.sleep(5)
#
# t1 = int(time.time())
# status = True
# num = 0
#
# while status:
# t2 = int(time.time())
# if t2 - t1 < 30:
# new_height = wb.execute_script(js)
# if new_height > height:
# time.sleep(1)
# wb.execute_script(
# 'window.scrollTo(0, document.body.scrollHeight)')
# height = new_height
# t1 = int(time.time())
# elif num < 3:
# time.sleep(3)
# num = num + 1
# else: # timed out after retries; reached the bottom of the page
# status = False
#
# data = get_info2(wb)
#
# return data
# == the part that actually runs ===========================================================================
# #chromedriver fetches the posts directly
# ##build the search url
# url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
# post_list = get_posts(url)
#
#
# wb = webdriver.Chrome()
# wb.get(url)
# time.sleep(3)
# selector = html.etree.HTML(wb.page_source)  # parse the page source into an lxml HTML tree
# like = selector.xpath("//div[@data-testid='like']//div/span/span/text()")
# print(like)
#
# data = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
# post_list = []
# for i in range(len(data)):
# post = data[i].xpath('string(.)')  # merge the text nodes and strip surrounding whitespace
# print(post)
# post = str(post)
# post_list.append("".join(post))
#
#
# =============================================================================
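The like counts extracted this way are display strings ('3', '1,208', '12.5K'), not numbers. A hypothetical helper (not in the original) for normalizing them; the K/M suffix convention is an assumption about how Twitter abbreviates counts:

def parse_count(text):
    # Convert a display count like '1,208' or '12.5K' into an int.
    text = text.strip().replace(',', '')
    if not text:
        return 0
    suffixes = {'K': 1_000, 'M': 1_000_000}
    if text[-1].upper() in suffixes:
        return int(float(text[:-1]) * suffixes[text[-1].upper()])
    return int(text)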
# get_posts: same definition as at the top of this post
##Grabbing page_source just once at the end fails; by then the asynchronously
##loaded feed has dropped earlier posts, so the source is incomplete
# selector = html.etree.HTML(wb.page_source)  # parse the page source into an lxml HTML tree
# infos = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#
# post_list = []
# for info in infos:
# post = info.xpath("string(.)").strip()
# post_list.append(post)
# return post_list
#
# #scraping via WebElements instead; note that find_element_by_xpath (singular) returns one element, not a list, so this loop fails (see the sketch after this block)
# infos = wb.find_element_by_xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
# post_list = []
# for info in infos:
# post = info.text.strip()
# post_list.append(post)
# return post_list
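The WebElement attempt above iterates over the result of find_element_by_xpath, which returns a single element. The plural form is needed; with Selenium 4 the find_element_by_* helpers are also deprecated in favor of By locators. A sketch:

from selenium.webdriver.common.by import By

def get_posts_via_elements(wb):
    # Collect tweet text through WebElements instead of parsing page_source.
    infos = wb.find_elements(By.XPATH,
        "//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
    return [info.text.strip() for info in infos]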
url = 'https://twitter.com/search?q=Beijing%20Winter%20Olympics%20Opening%20Ceremony&src=typed_query'
post_list = get_posts(url)
comm_df = pd.DataFrame(post_list, columns=['post'])
print(f'{len(post_list)} posts scraped')
comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_Olympic_ceremony2.csv', encoding='utf_8_sig', index=False)
##
##Tweet text only; one block per search topic:
#url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_nuclear.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=shenzhou-13&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_shenzhou.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=China%20lunar%20soil&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_chinalunar.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=Abdulrazak%20Gurnah%20Nobel%20Prize%20in%20Literature&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_nobel.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=Vietnam%20Factories%20&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_vietnam.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=China%20provide%20vaccines&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_chinavaccine.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=Impact%20of%20Brexit%20on%20economy%20%27worse%20than%20Covid%27&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_brexiteconomy.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=rich%20countries%20hogging%20vaccines&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_richhogging.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=ease%20travel%20restrictions&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_easetravelres.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=US%20reaches%20agreement%20to%20end%20European%20digital%20services%20taxes&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_agreeontaxr.csv', encoding='utf_8_sig', index=False)
#
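The blocks above repeat the same three lines per topic; a mapping from query to output file collapses them into one loop. A sketch reusing the same get_posts and paths as above (only the first few queries shown):

topics = {
    'nuclear%20waste%20water': 'post_twitter_nuclear.csv',
    'shenzhou-13': 'post_twitter_shenzhou.csv',
    'China%20lunar%20soil': 'post_twitter_chinalunar.csv',
    # ... remaining queries as in the blocks above
}
for query, filename in topics.items():
    url = f'https://twitter.com/search?q={query}&src=typed_query'
    df = pd.DataFrame(get_posts(url), columns=['post'])
    df.to_csv(rf'C:\Users\18742\Desktop\毕业论文\代码\{filename}', encoding='utf_8_sig', index=False)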
##scratch: per-tweet metric xpaths (consolidated in get_info below)
#like = info.xpath("//div[@data-testid='like']//div/span/span/text()")
#retweet = info.xpath("//div[@data-testid='retweet']//div/span/span/text()")
#reply = info.xpath("//div[@data-testid='reply']//div/span/span/text()")
#data = {
# "post":post,
# "like":like,
# "retweet":retweet,
# "reply":reply}
#
##extract the per-tweet fields from the page; earliest attempt (note: the
##parameter is wb but the body uses driver, so this never ran as written)
#def get_info(wb,url,list,m):
# driver.implicitly_wait(10)
# m.append(driver.page_source)
# selector = html.etree.HTML(driver.page_source)  # parse the page source into an lxml HTML tree
# #grab the source first, then iterate toward a workable xpath or regex
# m.append(selector)
#
# infos = selector.xpath("//div[@class='css-1dbjc4n']")  # the page repeats this block; collect every instance and loop over them to pull out the fields we need
# m.append(infos)
## print(infos)
# for info in infos:
# ###the fields we want
# # data = info.xpath("//a[@class='J_ClickStat']/@href")  ##find the product name; search a broader scope first
## good = data.xpath("string(.)").strip()
# post = []
# data= info.xpath("//*/div[@class='css-1dbjc4n']/div/span/text()")
# for i in range(len(data)):
# post.append(data[i].xpath('string(.)'))
## post= data.xpath("string(.)").strip()
## for span in post:
## #stringify the current node
## post =span.xpath('string(.)')
# like = info.xpath("//*/div[@data-testid='like']//span/span/text()")
#
# retweet = info.xpath("//*/div[@data-testid='retweet']//span/span/text()")
# reply = info.xpath("//*/div[@data-testid='reply']//span/span/text()")
# data = {
## "good":good,
# "post":post,
# "like":like,
# "retweet":retweet,
# "reply":reply
# }
# print(data)
# list.append(data)
# return list
#
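One recurring bug across these attempts: an xpath that starts with // always searches the whole document, even when called on a single element, so every tweet in the loop gets the same global like/retweet/reply lists. Prefixing with .// scopes the query to the current node. A consolidated sketch reusing the selectors above (they target Twitter's 2021 markup, and which ancestor block holds both the text and the action bar is an assumption):

from lxml import etree

def get_info_fixed(page_source):
    # Pair each tweet's text with its own like/retweet/reply counts.
    selector = etree.HTML(page_source)
    rows = []
    for info in selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]"):
        rows.append({
            'post': info.xpath("string(./div[1])").strip(),
            # './/' keeps each lookup scoped to this tweet block only
            'like': info.xpath(".//div[@data-testid='like']//span/span/text()"),
            'retweet': info.xpath(".//div[@data-testid='retweet']//span/span/text()"),
            'reply': info.xpath(".//div[@data-testid='reply']//span/span/text()"),
        })
    return rows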