python 使用BeautifulSoup跟踪HTML中的链接

zujrkrfu  于 2022-12-10  发布在  Python
关注(0)|答案(9)|浏览(218)

我正在做的一门课程,需要我用BeautifulSoup解析这个:http://python-data.dr-chuck.net/known_by_Fikret.html
说明如下:找到位置3的链接(第一个名字的位置是1),打开该链接,然后重复此过程4次。答案就是您最后检索到的那个名字。
这是我目前拥有的代码:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re

# Read the starting URL and parse that single page.
url = input('Enter - ')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Number of loop passes, and the 1-based link position turned into a 0-based index.
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1
urllist = list()
taglist = list()

# NOTE: the anchor tags are collected once, from the starting page only.
tags = soup('a')

for i in range(count):
    # The same `tags` are appended on every pass, so taglist keeps growing
    # while taglist[pos] stays the same tag from the starting page.
    for tag in tags:
        taglist.append(tag)
    # `url` is reassigned here but is never opened or parsed inside the loop.
    url = taglist[pos].get('href', None)
    print('Retrieving: ', url)
    urllist.append(url)
print('Last URL: ', urllist[-1])

以下是我的输出:

Retrieving:  http://python-data.dr-chuck.net/known_by_Fikret.html 
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Last URL:  http://python-data.dr-chuck.net/known_by_Montgomery.html

下面是我应该得到的输出:

Retrieving: http://python-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Mhairade.html
Retrieving: http://python-data.dr-chuck.net/known_by_Butchi.html
Retrieving: http://python-data.dr-chuck.net/known_by_Anayah.html
Last URL:  http://python-data.dr-chuck.net/known_by_Anayah.html

我已经在这上面花了一段时间,但仍然无法让代码正确循环。我是编程新手,只是想寻求一些帮助,为我指出正确的方向。谢谢。

yqkkidmi

yqkkidmi1#

试试这个办法:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

# Prompt for the starting page, the number of pages to open, and the
# 1-based position of the link to follow (stored as a 0-based index).
url = input("Enter url:")

count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1

# Every link that was followed, in order.
urllist = list()
# URL of the page most recently opened; this is what "Last Url" reports.
last_retrieved = url

for i in range(count):
    last_retrieved = url
    # Close the HTTP response as soon as the page has been parsed.
    with urllib.request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')
    print('Retrieveing:', url)
    # href of every anchor on the page, in document order.
    taglist = [tag.get('href', None) for tag in soup('a')]

    # The link at the requested position becomes the page opened next.
    url = taglist[pos]

    urllist.append(url)

# Previously this indexed urllist[-2], which names the same URL when count is
# at least 2 but raised IndexError when count was 1.
print("Last Url:", last_retrieved)
zte4gxcn

zte4gxcn2#

def get_html(url):
    """Fetch *url* and return the page parsed with BeautifulSoup.

    The page body is read in full and handed to the 'html.parser' backend.
    Any exception raised by ``urllib.request.urlopen`` propagates to the
    caller.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')

# Prompt for the starting page, the number of links to follow, and the
# 1-based position of the link to follow (stored as a 0-based index).
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1

# Every link that was followed, in order.
urllist = list()

for i in range(count):
    # Fetch and parse the page `url` currently points at, so the anchors
    # always come from the newest page rather than from the starting page.
    taglist = list(get_html(url)('a'))

    # Follow the link at the requested position on the next pass.
    url = taglist[pos].get('href', None)

    print('Retrieving: ', url)
    urllist.append(url)

print('Last URL: ', urllist[-1])
c6ubokkw

c6ubokkw3#

您在同一个pos位置多次获取链接。使用i循环计数器获取偏移量,替换:

url = taglist[pos].get('href', None)

替换为:

url = taglist[pos + i].get('href', None)
whlutmcx

whlutmcx4#

您没有得到正确答案的原因是:您没有打开找到的链接。
在第一页找到正确的url后,您必须用urllib.request.urlopen(URL).read()打开这个url,并在新页面中寻找下一个链接。您必须重复三次。我建议用while循环来完成这个任务。
下面的代码可以实现这个目的:

# Starting page, number of pages to open, and 0-based index of the link to
# follow on each page.
url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
count = 5
pos = 2
# Every link that was followed, in order.
urllist = []
taglist = []

# URL of the page most recently opened; reported at the end.
last_retrieved = url

# Open `count` pages in total (previously a separate hard-coded 5).
for _ in range(count):
    print('Retrieving: ', url)
    last_retrieved = url
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # One pass over the anchors is enough; the list is rebuilt for each page.
    taglist = list(soup('a'))

    # The link at `pos` becomes the page opened on the next pass.
    url = taglist[pos].get('href', None)
    urllist.append(url)

# Report the last page actually opened, not the link found on it.
print("last url:", last_retrieved)
smtd7mpg

smtd7mpg5#

试试这个:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors: this context skips hostname checking and
# certificate verification, so HTTPS servers are not authenticated when it is
# passed to urlopen().
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def parse(url, count=7, pos=17):
    """Follow one link per page, *count* times, starting from *url*.

    Each page is fetched with the module-level ``ctx`` SSL context and
    parsed with BeautifulSoup. The href at 0-based index *pos* among the
    page's anchors becomes the next page and is printed.

    Args:
        url: address of the first page to open.
        count: number of pages to open (defaults to the original 7).
        pos: 0-based index of the anchor to follow (defaults to the
            original 17).

    Returns:
        None. The followed links are only printed.

    Raises:
        IndexError: if a page has no anchor at index *pos*.
    """
    for _ in range(count):
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(url, context=ctx) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        # href of every anchor on the page, in document order.
        list1 = [tag.get('href', None) for tag in soup('a')]
        url = list1[pos]
        print ('Retreiving:',url)

# parse() has no return statement, so after its own output this prints "None".
print (parse('http://py4e-data.dr-chuck.net/known_by_Lorenz.html'))

这就是我的输出:

Retreiving: http://py4e-data.dr-chuck.net/known_by_Cadyn.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Phebe.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Cullen.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Alessandro.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Gurveer.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Anureet.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Sandie.html
None
kjthegm6

kjthegm66#

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors: this context skips hostname checking and
# certificate verification, so HTTPS servers are not authenticated.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Prompt for the starting page, the number of links to follow, and the
# 1-based position of the link to follow (stored as a 0-based index).
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1
# Every link that was followed, in order.
urllist = list()
for i in range(count):
    # Pass ctx so the relaxed SSL settings above actually take effect, and
    # close the HTTP response once the body is read.
    with urllib.request.urlopen(url, context=ctx) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Rebuild the tag list for every page. A list shared across passes keeps
    # the first page's tags at the front, so taglist[pos] would never change.
    taglist = list(soup('a'))
    print('Retrieving: ', url)
    url = taglist[pos].get('href', None)
    urllist.append(url)

print('Retrieving: ', urllist[-1])
atmip9wb

atmip9wb7#

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors: this context skips hostname checking and
# certificate verification, so HTTPS servers are not authenticated.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Every link that was followed, in order.
urllist = list()
# Prompt for the starting page, the number of links to follow, and the
# 1-based position of the link to follow (stored as a 0-based index).
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1

for i in range(count):
    # Pass ctx so the relaxed SSL settings above actually take effect, and
    # close the HTTP response once the body is read.
    with urllib.request.urlopen(url, context=ctx) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    # The link at the requested position becomes the page opened next.
    url = tags[pos].get('href', None)
    print('Retrieving: ', url)
    urllist.append(url)

print('Retrieving: ', urllist[-1])
kuuvgm7e

kuuvgm7e8#

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
# Ask for the starting page, how many pages to open, and the 1-based position
# of the link to follow (kept as a 0-based index).
url = input("Enter url:")
count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1
# Links followed so far, oldest first.
urllist = []
for _ in range(count):
    page = urllib.request.urlopen(url)
    document = BeautifulSoup(page, 'html.parser')
    print('Retrieveing:', url)
    # href of every anchor on the current page, in document order.
    hrefs = [anchor.get('href', None) for anchor in document('a')]
    # The chosen link is both recorded and opened on the next pass.
    url = hrefs[pos]
    urllist.append(url)
# The most recently followed link is the answer.
print("Last Url:", urllist[-1])
bhmjp9jg

bhmjp9jg9#

#assignment2
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl



# Number of links to follow and the 1-based position of the link on each page.
count = 7
position = 18

# SSL context with hostname checking and certificate verification turned off.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Chain of page URLs; the newest entry is always the next page to open.
idea = ['https://py4e-data.dr-chuck.net/known_by_Lynn.html']

# count + 1 passes: the starting page plus one page per followed link.
for _ in range(count + 1):
    url = idea[-1]
    print("retrieving:", url)
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    # href of every anchor on this page, in document order.
    hrefs = [anchor.get('href', None) for anchor in soup('a')]

    # Queue the link at the requested position for the next pass.
    idea.append(hrefs[position - 1])

相关问题