Is there a simple way to print scraped output to a CSV file?

6za6bjd0  asked on 2023-03-27
  • Python: Python 3.11.2; Editor: PyCharm 2022.3.3 (Community Edition), Build PC-223.8836.43; OS: Windows 11 Pro, 22H2, Build 22621.1413; Browser: Chrome 111.0.5563.65 (Official Build) (64-bit)

I have a URL (for example, https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963) from which I scrape nine items. I would like the script to create a CSV file and write my scraped output (the nine items) into columns of that file. Is there a really simple way to do this?

from bs4 import BeautifulSoup
import requests
import csv

html_text = requests.get("https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963").text
soup = BeautifulSoup(html_text, "lxml")
cases = soup.find_all("div", class_="wrapper jcard has-padding-30 blocks has-no-bottom-padding")

for case in cases:
    # .replace("\xa0", " ") normalizes non-breaking spaces to regular spaces
    case_title = case.find("div", class_="title-wrapper").text.replace("\xa0", " ")
    case_plaintiff = case.find("td", {"data-th": "Plaintiff"}).text.replace("\xa0", " ")
    case_defendant = case.find("td", {"data-th": "Defendant"}).text.replace("\xa0", " ")
    case_number = case.find("td", {"data-th": "Case Number"}).text.replace("\xa0", " ")
    case_filed = case.find("td", {"data-th": "Filed"}).text.replace("\xa0", " ")
    court = case.find("td", {"data-th": "Court"}).text.replace("\xa0", " ")
    case_nature_of_suit = case.find("td", {"data-th": "Nature of Suit"}).text.replace("\xa0", " ")
    case_cause_of_action = case.find("td", {"data-th": "Cause of Action"}).text.replace("\xa0", " ")
    jury_demanded = case.find("td", {"data-th": "Jury Demanded By"}).text.replace("\xa0", " ")

    print(f"{case_title.strip()}")
    print(f"{case_plaintiff.strip()}")
    print(f"{case_defendant.strip()}")
    print(f"{case_number.strip()}")
    print(f"{case_filed.strip()}")
    print(f"{court.strip()}")
    print(f"{case_nature_of_suit.strip()}")
    print(f"{case_cause_of_action.strip()}")
    print(f"{jury_demanded.strip()}")

up9lanfz 1#

Build a list of lists with your data and dump it to a CSV:

from bs4 import BeautifulSoup
import requests
import csv

html_text = requests.get("https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963").text
soup = BeautifulSoup(html_text, "lxml")
cases = soup.find_all("div", class_="wrapper jcard has-padding-30 blocks has-no-bottom-padding")

output = []
for case in cases:
    # .replace("\xa0", " ") normalizes non-breaking spaces to regular spaces
    case_title = case.find("div", class_="title-wrapper").text.replace("\xa0", " ")
    case_plaintiff = case.find("td", {"data-th": "Plaintiff"}).text.replace("\xa0", " ")
    case_defendant = case.find("td", {"data-th": "Defendant"}).text.replace("\xa0", " ")
    case_number = case.find("td", {"data-th": "Case Number"}).text.replace("\xa0", " ")
    case_filed = case.find("td", {"data-th": "Filed"}).text.replace("\xa0", " ")
    court = case.find("td", {"data-th": "Court"}).text.replace("\xa0", " ")
    case_nature_of_suit = case.find("td", {"data-th": "Nature of Suit"}).text.replace("\xa0", " ")
    case_cause_of_action = case.find("td", {"data-th": "Cause of Action"}).text.replace("\xa0", " ")
    jury_demanded = case.find("td", {"data-th": "Jury Demanded By"}).text.replace("\xa0", " ")

    output.append([
        case_title.strip(),
        case_plaintiff.strip(),
        case_defendant.strip(),
        case_number.strip(),
        case_filed.strip(),
        court.strip(),
        case_nature_of_suit.strip(),
        case_cause_of_action.strip(),
        jury_demanded.strip(),
    ])

with open("output.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(output)
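
If you also want a header row, you can write one before the data rows; the column names below are only illustrative:

header = ["Title", "Plaintiff", "Defendant", "Case Number", "Filed",
          "Court", "Nature of Suit", "Cause of Action", "Jury Demanded By"]

with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)   # column names first
    writer.writerows(output)  # then one row per case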

8yoxcaq7 2#

Sure - the simplest option is the standard library csv module.
I took the liberty of refactoring your .replace().strip() into a function; we also collect all the case data into a list of dictionaries before writing it to the file. This makes it easier to add new columns without having to handle their names twice.

from bs4 import BeautifulSoup
import requests
import csv

def process_text(tag):
    # The callers below pass BeautifulSoup Tag objects, so pull .text first,
    # then normalize non-breaking spaces and trim surrounding whitespace
    return tag.text.replace("\xa0", " ").strip()

html_text = requests.get("https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963").text
soup = BeautifulSoup(html_text, "lxml")
cases = soup.find_all("div", class_="wrapper jcard has-padding-30 blocks has-no-bottom-padding")

data = []

for case in cases:
    data.append(
        {
            "case_title": process_text(case.find("div", class_="title-wrapper")),
            "case_plaintiff": process_text(case.find("td", {"data-th": "Plaintiff"})),
            "case_defendant": process_text(case.find("td", {"data-th": "Defendant"})),
            "case_number": process_text(case.find("td", {"data-th": "Case Number"})),
            "case_filed": process_text(case.find("td", {"data-th": "Filed"})),
            "court": process_text(case.find("td", {"data-th": "Court"})),
            "case_nature_of_suit": process_text(case.find("td", {"data-th": "Nature of Suit"})),
            "case_cause_of_action": process_text(case.find("td", {"data-th": "Cause of Action"})),
            "jury_demanded": process_text(case.find("td", {"data-th": "Jury Demanded By"})),
        }
    )

with open("cases.csv", "w") as f:
    writer = csv.DictWriter(f, fieldnames=data[0].keys())
    writer.writeheader()
    writer.writerows(data)
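
One caveat: if a docket page lacks one of these fields, case.find() returns None and process_text raises an AttributeError. A defensive variant (my addition, not part of the original answer) could fall back to an empty string:

def process_text(tag):
    # Return cleaned text, or "" when the element was not found
    if tag is None:
        return ""
    return tag.text.replace("\xa0", " ").strip()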

9rnv2umw 3#

pandas has a .to_csv method.

# import pandas as pd
csv_filename = 'x.csv'  # <-- name of or path to the output file

th_fields = ['Plaintiff', 'Defendant', 'Case Number', 'Filed', 'Court', 
             'Nature of Suit', 'Cause of Action', 'Jury Demanded By']
case_rows = []
for c in cases:
    title = c.find("div", class_="title-wrapper")
    row = {'title': title.text.strip()} if title else {}
    # Grab every <td> whose data-th attribute names one of the wanted fields
    for td in c.find_all('td', {'data-th': lambda th: th in th_fields}):
        row[td['data-th']] = td.text.strip()
    case_rows.append(row)

pd.DataFrame(case_rows).to_csv(csv_filename, index=False)
# pd.DataFrame(case_rows).T.to_csv(csv_filename, index=False, header=False)

You can also transpose with .T, as in the commented-out last line, to write the fields as rows instead of columns.

You can also use read_html to get all of the tables directly from the URL:

# import pandas as pd
url = 'https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963'
csv_filename = 'x.csv'
for ti, df in enumerate(pd.read_html(url)):
    # Write the first table fresh ('w'), then append ('a') the rest
    df.to_csv(csv_filename, mode='a' if ti else 'w', index=False)
    # Append a separator line between consecutive tables
    pd.DataFrame({'': []}).to_csv(csv_filename, mode='a')
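
If the site rejects pandas' default request (a 403 is not unusual on sites that filter by User-Agent), you can fetch the page yourself and hand the HTML to read_html via a StringIO buffer; the User-Agent value below is just a placeholder:

import io
import requests
import pandas as pd

url = 'https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963'
csv_filename = 'x.csv'

# Fetch the page with an explicit User-Agent, then parse the tables from the HTML string
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
for ti, df in enumerate(pd.read_html(io.StringIO(html))):
    df.to_csv(csv_filename, mode='a' if ti else 'w', index=False)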
