pandas 在刮取后的下一个html页面中显示清理数据

我想链接的Python文件和HTML页面使用 flask 应用程序。第一页（upload.html页面），用户需要从两个下拉菜单中选择工作字段和职位。然后，用户需要上传包含URL列表的文件，这些URL将在用户单击按钮上传后使用自动编码进行抓取。问题是我只成功地显示了那些不干净的数据。
下面是上传页面广告展示页面的HTML代码片段代码：

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>UPLOAD FILE</title>
    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" />
</head>
<body>
    <div class="wrapper2">
        <nav class="navbar">
            <ul>
                <li><a href="#">HOME</a></li>
                <li><a href="#">UPLOAD FILE</a></li>
                <li><a href="#">DETAILS</a></li>
                <li><a href="#">CLASSIFICATION</a></li>
                <li><a href="#">SHORTLISTED</a></li>
                <li><a href="#">ANALYSIS</a></li>
            </ul>
        </nav>
        <div class="job">
            <h2>Choose job's field: </h2>
            <select name="job-field" id="job-field">
                <option value="" selected="selected">Select field</option>
            </select>
              <br><br>
            <h2>Choose job's title: </h2>
            <select name="job-title" id="job-title">
                <option value="" selected="selected">Select title</option>
            </select>
        </div>

        <div class="upload">
            <h2>Upload a CSV file: </h2>
            <form action="/upload" method="post" enctype="multipart/form-data">
              <input type="file" name="file">
              <br><br>
              <button type="submit">Upload</button>
            </form>
        </div>
    </div>
    <script src="{{ url_for('static', filename='script.js') }}"></script>
</body>
</html>

<!DOCTYPE html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>Scraped Information</title>
    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
  </head>
  <body>
    <nav class="navbar">
        <ul>
            <li><a href="#">HOME</a></li>
            <li><a href="#">UPLOAD FILE</a></li>
            <li><a href="#">DETAILS</a></li>
            <li><a href="#">CLASSIFICATION</a></li>
            <li><a href="#">SHORTLISTED</a></li>
            <li><a href="#">ANALYSIS</a></li>
        </ul>
    </nav>
    <h1>{{ job_field }}</h1>
    {% if job_field == 'IT' %}
    <table>
      <thead>
        <tr>
          <th>Job's Title</th>
          <th>Location</th>
          <th>Experiences</th>
          <th>Education</th>
          <th>Certifications</th>
          <th>Skills</th>
          <th>Languages</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Senior FrontEnd Developer</td>
          <td>No specific</td>
          <td>No specific</td>
          <td>Bachelor Degree in Software Engineering, Computer Science or equivalent</td>
          <td>No specific</td>
          <td>html,css,sass,js,gulp,webpack,jquery,vuejs,vuex,inertiajs,laravel,php,</td>
          <td>No specific</td>
        </tr>
      </tbody>
    </table>
    {% elif job_field == 'DM' %}
    <p>Details for Digital Marketing go here...</p>
    {% elif job_field == 'FN' %}
    <p>Details for Finance go here...</p>
    {% endif %}

    <h2>Scraped Information</h2>
    <table>
      <thead>
        <tr>
          <th>Name</th>
          <th>Title</th>
          <th>Location</th>
          <th>Experiences</th>
          <th>Education</th>
          <th>Certifications</th>
          <th>Skills</th>
          <th>Languages</th>
        </tr>
      </thead>
      <tbody>
        {% for item in data %}
        <tr>
          <td>{{ item["Name"] }}</td>
          <td>{{ item["Title"] }}</td>
          <td>{{ item["Location"] }}</td>
          <td>{{ item["Experiences"] }}</td>
          <td>{{ item["Education"] }}</td>
          <td>{{ item["Certifications"] }}</td>
          <td>{{ item["Skills"] }}</td>
          <td>{{ item["Languages"] }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </body>
</html>

下面是flask应用程序的代码：

from flask import Flask, render_template, request, redirect, url_for
import pandas as pd
from scraper import scrape_data
from data_cleaning import clean_data

app = Flask(__name__)
app.static_folder = 'static'

@app.route('/')
def home():
    return render_template('home_page.html')

@app.route('/upload_page')
def upload_page():
    return render_template('upload_page.html')

@app.route('/process', methods=['POST'])
def process():
    job_field = request.form['job_field']
    return redirect(url_for('display', job_field=job_field))

@app.route('/upload', methods=['POST'])
def upload():
    file = request.files['file']
    df = pd.read_csv(file)
    data = scrape_data(df)
    cleandata = clean_data(data)
    return render_template('display.html', data=cleandata)

if __name__ == '__main__':
    app.run(debug=True)

下面是scraping的代码：

def scrape_data(df):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome()

    sleep(2)
    url = 'https://www.linkedin.com/login'
    driver.get(url)
    sleep(2)

    # Task 1.2: Import username and password
    username = "rinarinza@gmail.com"
    password = "Zahra2011"
    print('- Finish importing the login credentials')
    sleep(2)

    # Task 1.2: Key in login credentials
    email_field = driver.find_element(By.ID, 'username')
    email_field.send_keys(username)

    password_field = driver.find_element(By.NAME, 'session_password')
    password_field.send_keys(password)
    sleep(2)

    # Task 1.2: Click the Login button
    signin_field = driver.find_element(By.XPATH, '//*[@type="submit"]')
    signin_field.click()
    sleep(3)
    df.to_csv('mydata.csv', index=False)
    data = []
    with open('mydata.csv', 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for url in csvreader:
            # Task 1: Login to Linkedin
            # sleep(2)
            profile_url = url[0]
            driver.get(profile_url)
            start = time.time()
            initialScroll = 0
            finalScroll = 1000

            while True:
                driver.execute_script(f"window.scrollTo({initialScroll},{finalScroll})")
                # this command scrolls the window starting from the pixel value stored in the initialScroll
                # variable to the pixel value stored at the finalScroll variable
                initialScroll = finalScroll
                finalScroll += 1000

                # we will stop the script for 3 seconds so that the data can load
                time.sleep(2)
                end = time.time()
                # We will scroll for 20 seconds.
                if round(end - start) > 20:
                    break

            src = driver.page_source
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # print(soup.prettify())

            # 1 NAME
            intro = soup.find('div', {'class': 'pv-text-details__left-panel'})
            name_loc = intro.find("h1")
            name = name_loc.get_text().strip()

            # Title
            title = soup.find('div', {"class": "text-body-medium break-words"}).get_text().replace('\n', '').strip()

            # 2 LOCATION
            loc = soup.find('span', {'class': 'text-body-small inline t-black--light break-words'})
            location = loc.get_text().strip()

            # 3 EXPERIENCE
            experiences = []
            try:
                experience_div = soup.find('div', {"id": "experience"})
                exp_list = experience_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
                    'ul').findAll('li',
                                  {"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})

                for each_exp in exp_list:
                    exp_temp = {
                        each_exp.findNext('span', {"class": "t-14 t-normal t-black--light"}).findNext(
                            'span').get_text().replace('\n', '').strip()  # timeframe
                    }

                    experiences.append(exp_temp)

            # handle the profiles which does not contain experience section
            except AttributeError:
                pass

            # 4 EDUCATION LEVEL AND HISTORY
            educations = []
            try:
                education_div = soup.find('div', {"id": "education"})
                edu_list = education_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
                    'ul').findAll('li',
                                  {"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})

                for each_edu in edu_list:
                    col_edu = each_edu.findNext("a",
                                                {
                                                    "class": "optional-action-target-wrapper display-flex flex-column full-width"})

                    edu_temp = {
                        col_edu.findNext('span', {"class": "t-14 t-normal"}).findNext(
                            'span').get_text().strip(),  # coursename
                    }

                    educations.append(edu_temp)

            # handle the profiles which does not contain education section
            except AttributeError:
                pass

            # 5 lICENSE AND CERTIFICATIONS
            certifications = []
            try:
                cert_div = soup.find('div', {"id": "licenses_and_certifications"})
                cert_list = cert_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
                    'ul').findAll('li',
                                  {"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})

                for cert in cert_list:
                    col_cert = cert.findNext("div", {"class": "display-flex flex-column full-width align-self-center"})
                    cert_temp = {
                        col_cert.findNext('div').findNext('span').findNext('span').text.replace('\n', '').strip()
                        # cert name
                    }

                    certifications.append(cert_temp)

            # handle the profiles which does not contain certification section
            except AttributeError:
                pass

            # 6 SKILLS
            skills = []
            try:
                skills_div = soup.find('div', {"id": "skills"})
                skills_list = skills_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
                    'ul').findAll('li',
                                  {"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})

                for skill in skills_list:
                    col_skill = skill.findNext("a", {"data-field": "skill_card_skill_topic"})
                    skill_temp = {
                        col_skill.findNext('div').findNext('span').findNext('span').text.replace('\n', '').strip()
                        # skills name
                    }
                    skills.append(skill_temp)

            # handle the profiles which does not contain skills section
            except AttributeError:
                pass

            # 7 LANGUAGES
            languages = []
            try:
                Lang_div = soup.find('div', {"id": "languages"})
                lang_list = Lang_div.findNext('div').findNext('div', {"class": "pvs-list__outer-container"}).findChild(
                    'ul').findAll('li',
                                  {"class": "artdeco-list__item pvs-list__item--line-separated pvs-list__item--one-column"})

                for lang in lang_list:
                    lang_temp = {
                        lang.findNext("span", {"class": "mr1 t-bold"}).findNext('span').get_text().strip()  # lang name
                    }
                    languages.append(lang_temp)

            # handle the profiles which does not contain language section
            except AttributeError:
                pass

            data.append({
                "Name": name,
                "Title": title,
                "Location": location,
                "Experiences": experiences,
                "Education": educations,
                "Certifications": certifications,
                "Skills": skills,
                "Languages": languages
            })

    driver.quit()

    return data

下面是数据清理的代码：

import csv
import pandas as pd
import re
from datetime import datetime
import random

def clean_data(data):
    dfCSV = pd.DataFrame(data)

    # Report missing data
    # dfCSV.isna().sum()

    # cleaning data
    dfCSV['Location'].replace('[]', 'None', inplace=True)
    dfCSV['Experiences'].replace('[]', 'None', inplace=True)
    dfCSV['Education'].replace('[]', 'None', inplace=True)
    dfCSV['Certifications'].replace('[]', 'None', inplace=True)
    dfCSV['Skills'].replace('[]', 'None', inplace=True)
    dfCSV['Languages'].replace('[]', 'None', inplace=True)

    # Convert all string columns to lowercase
    dfCSV = dfCSV.applymap(lambda s: s.lower() if type(s) == str else s)

    df = dfCSV.replace(to_replace=r'[^a-zA-Z\d,\s]', value='', regex=True)

    # convert the dataframe to a list
    data_clean = df.values.tolist()

    print(df)
    return data_clean

pandas 在刮取后的下一个html页面中显示清理数据

1条答案

相关问题

热门标签

最新问答