scrapy: Why doesn't my regular expression match the following string?

irlmq6kh · asked 2022-11-09

I am trying to extract the class abbreviation (ECON 114) and the class name (Adv Quant Methods) from strings like ECON 114 - 01   Adv Quant Methods in Python.
The expression r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)' works in my regex tester, but when I run it in Scrapy the returned array is empty. What am I doing wrong? (Code below.)
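For reference, a quick check in plain Python (assuming the test string uses ordinary spaces, as a regex tester would) shows the pattern itself is fine:

import re

s = 'ECON 114 - 01   Adv Quant Methods'
print(re.findall(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)', s))
# [('ECON 114', 'Adv Quant Methods')]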

import scrapy
import re as pythonRe

# with open('../econ.html', 'r') as f:
#     html_string = f.read()

econ_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Referer': 'https://pisa.ucsc.edu/class_search/',
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive'}

class ClassesSpider(scrapy.Spider):
    name = "classes"

    def start_requests(self):

        urls = [
            'https://pisa.ucsc.edu/class_search/index.php'
            ]
        for url in urls:
            yield scrapy.Request(url=url, method="POST", headers=econ_headers, body='action=results&binds%5B%3Aterm%5D=2228&binds%5B%3Areg_status%5D=all&binds%5B%3Asubject%5D=ECON&binds%5B%3Acatalog_nbr_op%5D=%3D&binds%5B%3Acatalog_nbr%5D=&binds%5B%3Atitle%5D=&binds%5B%3Ainstr_name_op%5D=%3D&binds%5B%3Ainstructor%5D=&binds%5B%3Age%5D=&binds%5B%3Acrse_units_op%5D=%3D&binds%5B%3Acrse_units_from%5D=&binds%5B%3Acrse_units_to%5D=&binds%5B%3Acrse_units_exact%5D=&binds%5B%3Adays%5D=&binds%5B%3Atimes%5D=&binds%5B%3Aacad_career%5D=&binds%5B%3Aasynch%5D=A&binds%5B%3Ahybrid%5D=H&binds%5B%3Asynch%5D=S&binds%5B%3Aperson%5D=P', callback=self.parse)

    def parse(self, response):
        def professor_filter(item):
            if pythonRe.search(r'\w\.', item) or "Staff" in item:
                return True

        #class_regex = pythonRe.compile(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)')
        page = response.url.split("/")[-2]
        classDict = {}
        classes = response.xpath('//a[contains(@id, "class_id")]/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)')
        professors = response.xpath('//div[contains(@class, "col-xs-6 col-sm-3")]/text()').getall()

        professors_filtered = list(filter(professor_filter, professors))

        #for x in range((len(classes))):
          #classDict[classes[x]] = {'professor': professors_filtered[x]}

        print(classes)
        print(len(classes))
        print(professors_filtered)
        print(len(professors_filtered))
        print(professors)
        print(classDict)

        filename = f'class-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')

64jmpszr · answer #1

If you first fetch the full text of classes and display it, you will find that Scrapy gives \xa0 instead of &nbsp;. You have to use \xa0+ instead of [ ]+:

classes = response.xpath('//a[contains(@id, "class_id")]/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+[\xa0]+([\w\s]+\b)')

This gives me:

classes: ['ECON 1', 'Intro Microeconomic', 'ECON 1', 'Intro Microeconomic', 'ECON 2', 'Intro Macroeconomic', 'ECON 10A', 'Econ of Accounting', 'ECON 10A', 'Econ of Accounting', 'ECON 11A', 'Math Methd for Econ', 'ECON 11B', 'Math Methds Econ II', 'ECON 100A', 'Intermed Microecon', 'ECON 100A', 'Intermed Microecon', 'ECON 100B', 'Intermed Macroecon', 'ECON 101', 'Managerial Econ', 'ECON 104', 'Numbr Truth', 'ECON 111A', 'Intermed Account I', 'ECON 113', 'Intro Econometrics', 'ECON 113', 'Intro Econometrics', 'ECON 114', 'Adv Quant Methods', 'ECON 117B', 'Tax Factors', 'ECON 125', 'Econ History Of US', 'ECON 126', 'Why Succeed', 'ECON 133', 'Security Markets', 'ECON 136', 'Business Strategy', 'ECON 141', 'Internatl Finance', 'ECON 150', 'Public Finance', 'ECON 161A', 'Marketing', 'ECON 166A', 'Game Theory']
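To see the difference in isolation, here is a minimal plain-re check; the test string is made up, but it mirrors what the selector actually returns:

import re

s = 'ECON 114 - 01\xa0\xa0\xa0Adv Quant Methods'  # \xa0 = non-breaking space
print(re.findall(r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)', s))     # [] -- [ ]+ only matches a plain space
print(re.findall(r'(?i)(\w+\s\w+)+\s-\s\w+[\xa0]+([\w\s]+\b)', s))  # [('ECON 114', 'Adv Quant Methods')]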

I think the problem is that response.body gives the raw HTML as a string, but other functions have to convert this string into an HTML tree (as the modules lxml and BeautifulSoup do), and that conversion automatically turns HTML entities (like &nbsp;) into characters.
As far as I know, Scrapy uses parsel to select elements in HTML.
See the Scrapy documentation: Selectors
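A small sketch of that behavior (the sample HTML is made up, but Selector is the real parsel API that Scrapy builds on):

from parsel import Selector

html = '<a id="class_id_1">ECON 114 - 01&nbsp;&nbsp;&nbsp;Adv Quant Methods</a>'
sel = Selector(text=html)
print(repr(sel.xpath('//a/text()').get()))
# 'ECON 114 - 01\xa0\xa0\xa0Adv Quant Methods'  -- &nbsp; decoded to \xa0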

EDIT:

Full working code with the other changes:

  • It uses FormRequest instead of a hand-encoded POST body.
  • It first finds the rows in the table, then looks up the class and the professor separately in each row.
import scrapy
import re

econ_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Referer': 'https://pisa.ucsc.edu/class_search/',
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
}

data = {
    'action': 'results',
    'binds[:term]': '2228',
    'binds[:reg_status]': 'all',
    'binds[:subject]': 'ECON',
    'binds[:catalog_nbr_op]': '=',
    'binds[:catalog_nbr]': '',
    'binds[:title]': '',
    'binds[:instr_name_op]': '=',
    'binds[:instructor]': '',
    'binds[:ge]': '',
    'binds[:crse_units_op]': '=',
    'binds[:crse_units_from]': '',
    'binds[:crse_units_to]': '',
    'binds[:crse_units_exact]': '',
    'binds[:days]': '',
    'binds[:times]': '',
    'binds[:acad_career]': '',
    'binds[:asynch]': 'A',
    'binds[:hybrid]': 'H',
    'binds[:synch]': 'S',
    'binds[:person]': 'P',
}

def professor_filter(item):
    return (re.search(r'\w\.', item) or "Staff" in item)

class ClassesSpider(scrapy.Spider):

    name = "classes"

    def start_requests(self):
        urls = ['https://pisa.ucsc.edu/class_search/index.php']
        for url in urls:
            #yield scrapy.Request(url,
            #                     headers=econ_headers,
            #                     body='action=results&binds%5B%3Aterm%5D=2228&binds%5B%3Areg_status%5D=all&binds%5B%3Asubject%5D=ECON&binds%5B%3Acatalog_nbr_op%5D=%3D&binds%5B%3Acatalog_nbr%5D=&binds%5B%3Atitle%5D=&binds%5B%3Ainstr_name_op%5D=%3D&binds%5B%3Ainstructor%5D=&binds%5B%3Age%5D=&binds%5B%3Acrse_units_op%5D=%3D&binds%5B%3Acrse_units_from%5D=&binds%5B%3Acrse_units_to%5D=&binds%5B%3Acrse_units_exact%5D=&binds%5B%3Adays%5D=&binds%5B%3Atimes%5D=&binds%5B%3Aacad_career%5D=&binds%5B%3Aasynch%5D=A&binds%5B%3Ahybrid%5D=H&binds%5B%3Asynch%5D=S&binds%5B%3Aperson%5D=P',
            #                     callback=self.parse)

            yield scrapy.FormRequest(url,
                                     headers=econ_headers,
                                     formdata=data,
                                     callback=self.parse)

    def parse(self, response):

        page = response.url.split("/")[-2]

        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')

        classDict = {}

        for row in all_rows:
            classname = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            professor = row.xpath('(.//div[@class="panel-body"]//div)[3]/text()').get().strip()
            print(classname, professor)
            if professor and professor_filter(professor):
                classDict[tuple(classname)] = [professor]
                yield {'class': tuple(classname), 'professor': professor}  # yielded items go to the csv feed
            else:
                print('skip:', professor)
        print(classDict)

        #filename = f'class-{page}.html'
        #with open(filename, 'wb') as f:
        #    f.write(response.body)
        #self.log(f'Saved file {filename}')

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ClassesSpider)
c.start()
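Assuming the whole script is saved as, say, main.py, running python main.py executes the spider without a Scrapy project; the FEEDS setting (added in Scrapy 2.1) writes every yielded item to output.csv.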
