我想在 Python 中从类似 ECON 114 - 01 Adv Quant Methods
这样的字符串中提取课程缩写(ECON 114)和课程名称(Adv Quant Methods)。
我使用的表达式是 r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)',
它在我的正则表达式测试器中可以正常匹配。但是,当我在 scrapy 中运行这个表达式时,返回的数组是空的。我做错了什么?(代码如下)
import scrapy;
import re as pythonRe;
# with open('../econ.html', 'r') as f:
#html_string = f.read()
# HTTP headers sent with the class-search POST request. The User-Agent
# identifies the client as Safari on macOS, and the Content-Type matches the
# URL-encoded form body that the spider submits.
econ_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Listed once: a repeated key in a dict literal silently overrides the
    # earlier entry, so the duplicate carried no extra meaning.
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    # Uses the same host as 'Origin' and 'Host' so the headers are consistent.
    'Referer': 'https://pisa.ucsc.edu/class_search/',
    # NOTE(review): a list value is passed through unchanged here; confirm the
    # HTTP client emits it in the form the server expects.
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
}
class ClassesSpider(scrapy.Spider):
    """Spider that submits an ECON class search and prints what it extracts.

    It POSTs a pre-built search form, pulls class titles and instructor names
    out of the result page, prints them for inspection, and saves the raw
    response body to a local HTML file.
    """

    name = "classes"

    def start_requests(self):
        """Yield the POST request that runs the class search."""
        urls = [
            # Host matches the 'Host'/'Origin' headers sent with the request.
            'https://pisa.ucsc.edu/class_search/index.php'
        ]
        for url in urls:
            yield scrapy.Request(url=url, method="POST", headers=econ_headers, body='action=results&binds%5B%3Aterm%5D=2228&binds%5B%3Areg_status%5D=all&binds%5B%3Asubject%5D=ECON&binds%5B%3Acatalog_nbr_op%5D=%3D&binds%5B%3Acatalog_nbr%5D=&binds%5B%3Atitle%5D=&binds%5B%3Ainstr_name_op%5D=%3D&binds%5B%3Ainstructor%5D=&binds%5B%3Age%5D=&binds%5B%3Acrse_units_op%5D=%3D&binds%5B%3Acrse_units_from%5D=&binds%5B%3Acrse_units_to%5D=&binds%5B%3Acrse_units_exact%5D=&binds%5B%3Adays%5D=&binds%5B%3Atimes%5D=&binds%5B%3Aacad_career%5D=&binds%5B%3Aasynch%5D=A&binds%5B%3Ahybrid%5D=H&binds%5B%3Asynch%5D=S&binds%5B%3Aperson%5D=P', callback=self.parse)

    def parse(self, response):
        """Extract class titles and instructors, print them, and save the page.

        Args:
            response: The response to the search request; its XPath selectors,
                URL and raw body are used.
        """
        def professor_filter(item):
            # Keep text nodes that contain a word character followed by a dot
            # (e.g. an initial) or the placeholder "Staff".
            return bool(pythonRe.search(r'\w\.', item) or "Staff" in item)

        page = response.url.split("/")[-2]
        # Not populated yet; printed below alongside the other debug output.
        classDict = {}
        # The selected text separates the section number from the title with
        # non-breaking spaces (\xa0) rather than plain spaces, so the gap must
        # accept both; a plain `[ ]+` never matches and yields an empty list.
        # NOTE(review): with two capture groups, .re() presumably returns a
        # flat list alternating abbreviation and title - confirm before
        # pairing entries with instructors by index.
        classes = response.xpath('//a[contains(@id, "class_id")]/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+[ \xa0]+([\w\s]+\b)')
        professors = response.xpath('//div[contains(@class, "col-xs-6 col-sm-3")]/text()').getall()
        professors_filtered = list(filter(professor_filter, professors))

        print(classes)
        print(len(classes))
        print(professors_filtered)
        print(len(professors_filtered))
        print(professors)
        print(classDict)

        # Keep a copy of the raw page so the extraction can be checked offline.
        filename = f'class-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
1条答案
按热度按时间64jmpszr1#
如果你先获取
classes
的完整文本并把它打印出来,就会发现
scrapy
给出的是 \xa0
而不是 &nbsp;,
因此你必须使用
\xa0+
而不是 [ ]+
这给了我:
我认为问题的原因是:
response.body
给出的是 HTML 的原始字符串,而其他函数可能需要先把这个字符串转换为 HTML 树
(就像模块 lxml
或 BeautifulSoup
所做的那样),并且在转换时可能会自动把 HTML 实体
(如
&nbsp;)转换为字符。据我所知,
scrapy
使用 parsel 来选择 HTML 中的元素。参见 Scrapy 文档:Selectors
编辑:
包含其他更改的完整工作代码