我想链接的Python文件和HTML页面使用 flask 应用程序。第一页(upload.html页面),用户需要从两个下拉菜单中选择工作字段和职位。然后,用户需要上传包含URL列表的文件,这些URL将在用户单击按钮上传后使用自动编码进行抓取。问题是我只成功地显示了那些不干净的数据。
下面是上传页面广告展示页面的HTML代码片段代码:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>UPLOAD FILE</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" />
</head>
<body>
<div class="wrapper2">
<nav class="navbar">
<ul>
<li><a href="#">HOME</a></li>
<li><a href="#">UPLOAD FILE</a></li>
<li><a href="#">DETAILS</a></li>
<li><a href="#">CLASSIFICATION</a></li>
<li><a href="#">SHORTLISTED</a></li>
<li><a href="#">ANALYSIS</a></li>
</ul>
</nav>
<div class="job">
<h2>Choose job's field: </h2>
<select name="job-field" id="job-field">
<option value="" selected="selected">Select field</option>
</select>
<br><br>
<h2>Choose job's title: </h2>
<select name="job-title" id="job-title">
<option value="" selected="selected">Select title</option>
</select>
</div>
<div class="upload">
<h2>Upload a CSV file: </h2>
<form action="/upload" method="post" enctype="multipart/form-data">
<input type="file" name="file">
<br><br>
<button type="submit">Upload</button>
</form>
</div>
</div>
<script src="{{ url_for('static', filename='script.js') }}"></script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Scraped Information</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
<nav class="navbar">
<ul>
<li><a href="#">HOME</a></li>
<li><a href="#">UPLOAD FILE</a></li>
<li><a href="#">DETAILS</a></li>
<li><a href="#">CLASSIFICATION</a></li>
<li><a href="#">SHORTLISTED</a></li>
<li><a href="#">ANALYSIS</a></li>
</ul>
</nav>
<h1>{{ job_field }}</h1>
{% if job_field == 'IT' %}
<table>
<thead>
<tr>
<th>Job's Title</th>
<th>Location</th>
<th>Experiences</th>
<th>Education</th>
<th>Certifications</th>
<th>Skills</th>
<th>Languages</th>
</tr>
</thead>
<tbody>
<tr>
<td>Senior FrontEnd Developer</td>
<td>No specific</td>
<td>No specific</td>
<td>Bachelor Degree in Software Engineering, Computer Science or equivalent</td>
<td>No specific</td>
<td>html,css,sass,js,gulp,webpack,jquery,vuejs,vuex,inertiajs,laravel,php,</td>
<td>No specific</td>
</tr>
</tbody>
</table>
{% elif job_field == 'DM' %}
<p>Details for Digital Marketing go here...</p>
{% elif job_field == 'FN' %}
<p>Details for Finance go here...</p>
{% endif %}
<h2>Scraped Information</h2>
<table>
<thead>
<tr>
<th>Name</th>
<th>Title</th>
<th>Location</th>
<th>Experiences</th>
<th>Education</th>
<th>Certifications</th>
<th>Skills</th>
<th>Languages</th>
</tr>
</thead>
<tbody>
{% for item in data %}
<tr>
<td>{{ item["Name"] }}</td>
<td>{{ item["Title"] }}</td>
<td>{{ item["Location"] }}</td>
<td>{{ item["Experiences"] }}</td>
<td>{{ item["Education"] }}</td>
<td>{{ item["Certifications"] }}</td>
<td>{{ item["Skills"] }}</td>
<td>{{ item["Languages"] }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</body>
</html>
下面是flask应用程序的代码:
from flask import Flask, render_template, request, redirect, url_for
import pandas as pd
from scraper import scrape_data
from data_cleaning import clean_data
app = Flask(__name__)
app.static_folder = 'static'
@app.route('/')
def home():
return render_template('home_page.html')
@app.route('/upload_page')
def upload_page():
return render_template('upload_page.html')
@app.route('/process', methods=['POST'])
def process():
job_field = request.form['job_field']
return redirect(url_for('display', job_field=job_field))
@app.route('/upload', methods=['POST'])
def upload():
file = request.files['file']
df = pd.read_csv(file)
data = scrape_data(df)
cleandata = clean_data(data)
return render_template('display.html', data=cleandata)
if __name__ == '__main__':
app.run(debug=True)
下面是scraping的代码:
def scrape_data(df):
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome()
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
sleep(2)
# Task 1.2: Import username and password
username = "rinarinza@gmail.com"
password = "Zahra2011"
print('- Finish importing the login credentials')
sleep(2)
# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
sleep(2)
# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[@type="submit"]')
signin_field.click()
sleep(3)
df.to_csv('mydata.csv', index=False)
data = []
with open('mydata.csv', 'r') as csvfile:
csvreader = csv.reader(csvfile)
for url in csvreader:
# Task 1: Login to Linkedin
# sleep(2)
profile_url = url[0]
driver.get(profile_url)
start = time.time()
initialScroll = 0
finalScroll = 1000
while True:
driver.execute_script(f"window.scrollTo({initialScroll},{finalScroll})")
# this command scrolls the window starting from the pixel value stored in the initialScroll
# variable to the pixel value stored at the finalScroll variable
initialScroll = finalScroll
finalScroll += 1000
# we will stop the script for 3 seconds so that the data can load
time.sleep(2)
end = time.time()
# We will scroll for 20 seconds.
if round(end - start) > 20:
break
src = driver.page_source
soup = BeautifulSoup(driver.page_source, 'html.parser')
# print(soup.prettify())
# 1 NAME
intro = soup.find('div', {'class': 'pv-text-details__left-panel'})
name_loc = intro.find("h1")
name = name_loc.get_text().strip()
# Title
title = soup.find('div', {"class": "text-body-medium break-words"}).get_text().replace('\n', '').strip()
# 2 LOCATION
loc = soup.find('span', {'class': 'text-body-small inline t-black--light break-words'})
location = loc.get_text().strip()
# 3 EXPERIENCE
experiences = []
try:
experience_div = soup.find('div', {"id": "experience"})
exp_list = experience_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
'ul').findAll('li',
{"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})
for each_exp in exp_list:
exp_temp = {
each_exp.findNext('span', {"class": "t-14 t-normal t-black--light"}).findNext(
'span').get_text().replace('\n', '').strip() # timeframe
}
experiences.append(exp_temp)
# handle the profiles which does not contain experience section
except AttributeError:
pass
# 4 EDUCATION LEVEL AND HISTORY
educations = []
try:
education_div = soup.find('div', {"id": "education"})
edu_list = education_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
'ul').findAll('li',
{"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})
for each_edu in edu_list:
col_edu = each_edu.findNext("a",
{
"class": "optional-action-target-wrapper display-flex flex-column full-width"})
edu_temp = {
col_edu.findNext('span', {"class": "t-14 t-normal"}).findNext(
'span').get_text().strip(), # coursename
}
educations.append(edu_temp)
# handle the profiles which does not contain education section
except AttributeError:
pass
# 5 lICENSE AND CERTIFICATIONS
certifications = []
try:
cert_div = soup.find('div', {"id": "licenses_and_certifications"})
cert_list = cert_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
'ul').findAll('li',
{"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})
for cert in cert_list:
col_cert = cert.findNext("div", {"class": "display-flex flex-column full-width align-self-center"})
cert_temp = {
col_cert.findNext('div').findNext('span').findNext('span').text.replace('\n', '').strip()
# cert name
}
certifications.append(cert_temp)
# handle the profiles which does not contain certification section
except AttributeError:
pass
# 6 SKILLS
skills = []
try:
skills_div = soup.find('div', {"id": "skills"})
skills_list = skills_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
'ul').findAll('li',
{"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})
for skill in skills_list:
col_skill = skill.findNext("a", {"data-field": "skill_card_skill_topic"})
skill_temp = {
col_skill.findNext('div').findNext('span').findNext('span').text.replace('\n', '').strip()
# skills name
}
skills.append(skill_temp)
# handle the profiles which does not contain skills section
except AttributeError:
pass
# 7 LANGUAGES
languages = []
try:
Lang_div = soup.find('div', {"id": "languages"})
lang_list = Lang_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
'ul').findAll('li',
{"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})
for lang in lang_list:
lang_temp = {
lang.findNext("span", {"class": "mr1 t-bold"}).findNext('span').get_text().strip() # lang name
}
languages.append(lang_temp)
# handle the profiles which does not contain language section
except AttributeError:
pass
data.append({
"Name": name,
"Title": title,
"Location": location,
"Experiences": experiences,
"Education": educations,
"Certifications": certifications,
"Skills": skills,
"Languages": languages
})
driver.quit()
return data
下面是数据清理的代码:
import csv
import pandas as pd
import re
from datetime import datetime
import random
def clean_data(data):
dfCSV = pd.DataFrame(data)
# Report missing data
# dfCSV.isna().sum()
# cleaning data
dfCSV['Location'].replace('[]', 'None', inplace=True)
dfCSV['Experiences'].replace('[]', 'None', inplace=True)
dfCSV['Education'].replace('[]', 'None', inplace=True)
dfCSV['Certifications'].replace('[]', 'None', inplace=True)
dfCSV['Skills'].replace('[]', 'None', inplace=True)
dfCSV['Languages'].replace('[]', 'None', inplace=True)
# Convert all string columns to lowercase
dfCSV = dfCSV.applymap(lambda s: s.lower() if type(s) == str else s)
df = dfCSV.replace(to_replace=r'[^a-zA-Z\d,\s]', value='', regex=True)
# convert the dataframe to a list
data_clean = df.values.tolist()
print(df)
return data_clean
1条答案
按热度按时间4nkexdtk1#
您在HTML模板中没有使用正确的变量名,请使用相同的变量名循环数据并显示数据。
在display.html中替换此行
与