pandas: How do I scrape a table from a page with Python and build a multi-column DataFrame?

Asked by 3ks5zfa0 on 2022-12-28, tagged Python

The database at https://aviation-safety.net/wikibase/ covers 1902 through 2022. For each accident in 2015 and 2016 (https://aviation-safety.net/database/dblist.php?Year=2015), I am trying to capture the table plus the narrative, probable cause, and classification. With the code below I can only scrape the table:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep

def scraping(year):

    headers =   {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }

    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    #sleep(randint(1,3))
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text,'html.parser')

    page_container = soup.find('div',{'class':'pagenumbers'})

    pages = max([int(page['href'].split('=')[-1]) for page in  page_container.find_all('a')])
        

    #info = []
    tl = []
    for page in range(1,pages+1):

        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        
        #sleep(randint(1,3))
        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')

        table = soup.find('table')
   
    
        for index,row in enumerate(table.find_all('tr')):
            if index == 0:
                continue

            link_ = 'https://aviation-safety.net/'+row.find('a')['href']
            
            #sleep(randint(1,3))
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')
            
           
            for i in table1.find_all('tr'):
                title = i.text
                tl.append(title)
                
                
    df= pd.DataFrame(tl)
    df.columns = ['status'] 
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)    
          

if __name__ == "__main__":

    START = 2015
    STOP = 2016

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping,years)

But the data is not organized. The DataFrame looks like this: (screenshot of the single-column result omitted)

The result should look like this: (screenshot of the expected multi-column table omitted)

Answer 1 (by ztmd8pv5)

It looks like the values in tl are strings, e.g. 'Status:Accident investigation report completed and information captured'.
Converting that list of strings to a pd.DataFrame gives you a single column containing all of the values in the list.
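
A minimal sketch of that single-column behavior (the sample strings here are made up):

import pandas as pd

# a list of plain strings becomes one unnamed column; nothing is split into fields
df = pd.DataFrame(['Status:completed', 'Date:Sunday 4 January 2015'])
print(df.shape)  # (2, 1)
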
If you want to use the "name" part of each string, e.g. Status, as the column header, you need to separate it from the rest of the text.

# maxsplit of 1 so we don't accidentally split up the values, e.g. time
title, text = title.split(":", maxsplit=1)

which looks like

('Status', 'Accident investigation report completed and information captured')

Now we create a dictionary

row_dict[title] = text

giving us

{'Status': 'Accident investigation report completed and information captured'}

We'll add to this dictionary in that last loop

# old
for i in table1.find_all('tr'):
    title = i.text
    tl.append(title)
# new
row_dict = {}
for i in table1.find_all('tr'):
    title = i.text
    title, text = title.split(":", maxsplit=1)
    row_dict[title] = text

After we've collected all the data from the page, i.e. once the row_dict loop has finished, we append to tl

row_dict = {}
for i in table1.find_all('tr'):
    title = i.text
    title, text = title.split(":", maxsplit=1)
    row_dict[title] = text

tl.append(row_dict)

Now, all together:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep

def scraping(year):

    headers =   {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }

    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    #sleep(randint(1,3))
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text,'html.parser')

    page_container = soup.find('div',{'class':'pagenumbers'})

    pages = max([int(page['href'].split('=')[-1]) for page in  page_container.find_all('a')])
        

    #info = []
    tl = []
    for page in range(1,pages+1):

        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        
        #sleep(randint(1,3))
        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')

        table = soup.find('table')
   
    
        for index,row in enumerate(table.find_all('tr')):
            if index == 0:
                continue

            link_ = 'https://aviation-safety.net/'+row.find('a')['href']
            
            #sleep(randint(1,3))
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')
            
            # changed section: build one {label: value} dict per accident page
            row_dict = {}
            for i in table1.find_all('tr'):
                title = i.text
                if ":" not in title:
                    continue  # skip any row that isn't a "label: value" pair
                title, text = title.split(":", maxsplit=1)
                row_dict[title.strip()] = text.strip()

            tl.append(row_dict)
                
    df= pd.DataFrame(tl)
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)    
          

if __name__ == "__main__":

    START = 2015
    STOP = 2016

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping,years)
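
Note that pd.DataFrame(tl) now receives a list of dictionaries: the columns come from the union of the dictionary keys, and any page that is missing a field just gets NaN in that column. A small sketch with made-up keys:

import pandas as pd

rows = [
    {'Status': 'done', 'Date': '02-JAN-2015'},
    {'Status': 'open', 'Location': 'Stornoway'},
]

# columns = union of keys ('Status', 'Date', 'Location'); missing fields become NaN
print(pd.DataFrame(rows))
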
Answer 2 (by j91ykkif)

The read_html() method provides convenient access to datasets like this one.

>>> url = "https://web.archive.org/web/20221027040903/https://aviation-safety.net/database/dblist.php?Year=2015"
>>>
>>> dfs = pd.read_html(url)
>>>
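>>> # read_html() returns one DataFrame per <table> on the page; here index 1 is the accident list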
>>> df = dfs[1].drop(columns="operator").dropna(axis=1, how="all")
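>>> # an unknown day of month is rendered as "??-"; map it to "01-" so the format string can parse it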
>>> df["date"] = pd.to_datetime(df.date.str.replace("??-", "01-", regex=False), format="%d-%b-%Y")
>>> df.set_index("date")
                                 type registration  fat.              location cat
date                                                                              
2015-01-02                  Saab 340B       G-LGNL     0       Stornoway Ai...  A1
2015-01-03         Antonov An-26B-100     RA-26082     0       Magadan-Soko...  A1
2015-01-04                  Fokker 50       5Y-SIB     0       Nairobi-Jomo...  A1
2015-01-08  Bombardier Challenger 300       PR-YOU     0       São Paulo-Co...  O1
2015-01-09  Cessna 208B Grand Caravan       8R-GAB     0       Matthews Rid...  A2
...                               ...          ...   ...                   ...  ..
2015-06-11                Eclipse 500       N508JA     0       Sacramento-E...  A2
2015-06-11               Hawker 800XP       N497AG     0       Port Harcour...  A1
2015-06-12             Boeing 737-33A       VH-NLK     0  near Kosrae Airpo...  I2
2015-06-15              Antonov An-2R     RA-84553     0       Tatsinsky di...  A1
2015-06-16        Boeing 737-322 (WL)       LY-FLB     0       Aktau Airpor...  O1

[100 rows x 5 columns]

It is difficult to control the User-Agent header with read_html(), so either use a cooperative site (the example above goes through the Wayback Machine), or do a little extra work with requests or curl to obtain the HTML text up front.
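
A minimal sketch of that requests-based approach, reusing the URL and headers from the question (recent pandas versions expect literal HTML to be wrapped in a file-like object):

import io
import requests
import pandas as pd

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
url = 'https://aviation-safety.net/database/dblist.php?Year=2015'

# fetch the page ourselves so we control the User-Agent header
resp = requests.get(url, headers=headers)
resp.raise_for_status()

# then hand the HTML text to read_html(), which parses every <table> it finds
dfs = pd.read_html(io.StringIO(resp.text))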
