我有一个代码,它可以遍历csv中的数据,搜索csv中的名字,然后通过scrapy过滤,在每个搜索页面上获取电子邮件。
请忽略我糟糕的代码。如果你有任何意见,如何使它更好,那么这将是非常感谢。
下面是代码:
from email import header
import email
from posixpath import split
from sqlite3 import Row
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import csv
from emails_on_site import EmailsSpider
from platform import python_branch
import re
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import requests
from platform import python_branch
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from platform import python_branch
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
# --- Module-level configuration and browser setup -------------------------
engine = 'python'          # pandas parsing engine name (currently unused here)
url_list = []              # reserved for URLs gathered during the crawl
email_list = []            # reserved for emails gathered during the crawl
email_file = "emails.csv"  # output file for scraped emails
contacts = []              # names that could not be found in the directory
# Simple email matcher; raw string so backslashes are taken literally.
pattern = re.compile(r"[\w\.]+@[\w\.]+")

# Keep the Chrome window open after the script finishes ("detach").
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s, options=chrome_options)
driver.implicitly_wait(3)

URL = "http://www.directory.ucla.edu/search.php"
# update URL
driver.get(URL)

# update info below
# Search input of the directory page; re-found later whenever the page reloads.
m = driver.find_element(By.ID, "q")

# csv module docs recommend newline='' so embedded newlines are handled
# correctly by the reader.
providers = open("names.csv", newline="")
data = csv.reader(providers)
def goLeft(presses=30):
    """Press the LEFT arrow key `presses` times in the search box.

    The default of 30 matches the original hand-unrolled behavior, which
    walked the cursor back over the text typed from ``str(row)``.

    Args:
        presses: number of LEFT keystrokes to send (default 30).
    """
    for _ in range(presses):
        m.send_keys(Keys.LEFT)
class EmailsSpider(CrawlSpider):
    """Crawl the UCLA directory and yield every email address found.

    Emails are extracted from the raw response text, which also picks up
    addresses that only appear in attribute values such as
    ``<img alt="ngreen@caps.ucla.edu">``.
    """

    name = 'emails'
    # change the below URLS
    # Scrapy's OffsiteMiddleware expects bare domains here — a path such as
    # 'directory.ucla.edu/search.php' never matches, so no links would be
    # followed at all.
    allowed_domains = ['directory.ucla.edu']
    start_urls = ['http://www.directory.ucla.edu/search.php']
    rules = (
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )

    # Raw strings so the \w / \. escapes are passed to the regex engine intact.
    EMAIL_RE = re.compile(r'[\w\.]+@[\w\.]+')
    # ^/$ anchor per *line* only with MULTILINE; without it this pattern could
    # only ever match if the entire page were one short name.
    NAME_RE = re.compile(r'^([A-Za-z \-]{2,25})+$', re.MULTILINE)

    def parse_item(self, response):
        """Yield one ``{'Email': ...}`` dict per address on the page."""
        emails = self.EMAIL_RE.findall(response.text)
        names = self.NAME_RE.findall(response.text)
        for email in emails:
            yield {
                'Email': email
            }
        print(names)
        print(emails)
# Search each CSV name in the directory, one per iteration.
for row in data:
    # row is a *list* of fields; str(row) produced "['Nicole Green']", which
    # is why the original needed backspaces and 30 LEFT presses to trim the
    # brackets. Joining the fields sends the clean name directly.
    query = " ".join(row).strip()
    if not query:
        continue  # skip blank CSV lines
    m.send_keys(query)
    m.send_keys(Keys.RETURN)
    time.sleep(3)  # crude wait for the results page; WebDriverWait is preferable
    try:
        # NOTE(review): merely naming the class does not run the crawl; a real
        # run needs scrapy.crawler.CrawlerProcess — TODO wire this up.
        EmailsSpider
    except Exception:  # never bare except: it would swallow KeyboardInterrupt too
        contacts.append(query + " was unable to be found")
        print(contacts)
        time.sleep(2)
    # Submitting the search navigated away, so the old input element is stale
    # ("element is not attached to the page document"). Reload the search page
    # and re-find the field before the next name.
    driver.get(URL)
    m = driver.find_element(By.ID, "q")
以下是CSV:
Nicole Green
Clark Kent
Steve Ballmer
Elon Musk
下面是我尝试自动化抓取的示例html:
<div class="email">
<a href="mailto:ngreen%40caps.ucla.edu">
<img src="/img/email.png" alt="ngreen@caps.ucla.edu">
</a>
</div>
我如何在img标签中抓取alt文本?
我试过了:[这个](https://medium.com/swlh/web-scraping-with-selenium-scrapy-9d9c2e9d83b1)、[这个](https://scrapeops.io/python-scrapy-playbook/scrapy-selenium/),以及这个问题:Python with Selenium "element is not attached to the page document"。
1条答案
按热度按时间4dc9hkyq1#
加载并保存CSV中的行,我认为你可以。下面是一个简单的例子,如何获得一封电子邮件
输出: