import requests
from bs4 import BeautifulSoup
import time
import requests
def get_url(*args):
matched_index =[]
url_site = []
for a in args:
city_country_match = False
city_country = a.replace(" ","-")
sitemap_index = 5
sitemap_index_max = 255
while not city_country_match:
res = requests.get(f'https://weatherspark.com/sitemap- {sitemap_index}.xml')
soup = BeautifulSoup(res.content,"xml")
if city_country in res.text: #is city/country within the sitemap.xml page?
city_country_match = True #ends while loop
for loc in soup.select('loc'):
text = loc.text
if 'Average-Weather-in-' in text and 'Year-Round' in text and city_country in text:
url_site.append(text) #***WHY DOES THIS NOT APPEND? ***
print(f'Found {a} on page {sitemap_index}')
matched_index.append(sitemap_index)
url_site.append(text)
time.sleep(1.5)
else:
sitemap_index = sitemap_index + 1
if sitemap_index == sitemap_index_max:
print(f"Did not find {city_country}")
return matched_index
return url_site
get_url("Austin Texas United States, "Calgary Canada")
实际结果:
在第20页上找到美国德克萨斯州奥斯汀
在第9页找到加拿大卡尔加里
产出:[20,9]
预期结果是它还将包括在输出中:
[”https://weatherspark.com/y/8004/Average-Weather-in-Austin-Texas-United-States-Year-Round“、“https://weatherspark.com/y/2349/Average-Weather-in-Calgary-Canada-Year-Round“]
1条答案
按热度按时间b09cbbtk1#
您可以尝试返回:
您的代码当前无法工作的原因是因为您有两个return语句,这意味着第二个return url_site将永远无法到达。