Typescript-爬虫案例

x33g5p2x  于2022-03-06 转载在 其他  
字(8.0k)|赞(0)|评价(0)|浏览(593)

前言

首先初始化两个文件

js配置文件

  1. npm init -y

生成

  1. {
  2. "name": "typescript",
  3. "version": "1.0.0",
  4. "description": "",
  5. "main": "index.js",
  6. "scripts": {
  7. "dev:build": "tsc -w",
  8. "dev:start": "nodemon node ./build/crowller.js",
  9. "dev": "concurrently npm:dev:*"
  10. },
  11. "nodemonConfig": {
  12. "ignore": [
  13. "data/*"
  14. ]
  15. },
  16. "keywords": [],
  17. "author": "",
  18. "license": "ISC",
  19. "devDependencies": {
  20. "@types/cheerio": "^0.22.14",
  21. "@types/superagent": "^4.1.4",
  22. "concurrently": "^5.0.0",
  23. "nodemon": "^2.0.1",
  24. "ts-node": "^8.5.2",
  25. "typescript": "^3.7.2"
  26. },
  27. "dependencies": {
  28. "cheerio": "^1.0.0-rc.3",
  29. "superagent": "^5.1.1"
  30. }
  31. }

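TS配置文件(注意:package.json 中的 dev:start 运行的是 ./build/crowller.js,因此需要把下面被注释掉的 "outDir" 取消注释并设为 "./build",否则编译结果不会输出到 build 目录)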

  1. tsc --init

生成

  1. {
  2. "compilerOptions": {
  3. /* Basic Options */
  4. // "incremental": true, /* Enable incremental compilation */
  5. "target": "es5", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019' or 'ESNEXT'. */
  6. "module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */
  7. // "lib": [], /* Specify library files to be included in the compilation. */
  8. // "allowJs": true, /* Allow javascript files to be compiled. */
  9. // "checkJs": true, /* Report errors in .js files. */
  10. // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */
  11. // "declaration": true, /* Generates corresponding '.d.ts' file. */
  12. // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */
  13. // "sourceMap": true, /* Generates corresponding '.map' file. */
  14. // "outFile": "./", /* Concatenate and emit output to single file. */
  15. // "outDir": "./", /* Redirect output structure to the directory. */
  16. // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */
  17. // "composite": true, /* Enable project compilation */
  18. // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */
  19. // "removeComments": true, /* Do not emit comments to output. */
  20. // "noEmit": true, /* Do not emit outputs. */
  21. // "importHelpers": true, /* Import emit helpers from 'tslib'. */
  22. // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */
  23. // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */
  24. /* Strict Type-Checking Options */
  25. "strict": true, /* Enable all strict type-checking options. */
  26. // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */
  27. // "strictNullChecks": true, /* Enable strict null checks. */
  28. // "strictFunctionTypes": true, /* Enable strict checking of function types. */
  29. // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */
  30. // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */
  31. // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */
  32. // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */
  33. /* Additional Checks */
  34. // "noUnusedLocals": true, /* Report errors on unused locals. */
  35. // "noUnusedParameters": true, /* Report errors on unused parameters. */
  36. // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */
  37. // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */
  38. /* Module Resolution Options */
  39. // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */
  40. // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */
  41. // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */
  42. // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */
  43. // "typeRoots": [], /* List of folders to include type definitions from. */
  44. // "types": [], /* Type declaration files to be included in compilation. */
  45. // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */
  46. "esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */
  47. // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */
  48. // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
  49. /* Source Map Options */
  50. // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */
  51. // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
  52. // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */
  53. // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */
  54. /* Experimental Options */
  55. // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */
  56. // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */
  57. }
  58. }

再安装

  1. npm install -D ts-node
  2. npm install -D typescript

提示:以下是本篇文章正文内容,下面案例可供参考

一、自建网页爬取

页面地址:http://www.dell-lee.com/typescript/demo.html?secret=secretKey

二、爬虫代码

crowller.ts

  1. import fs from 'fs';
  2. import path from 'path';
  3. import superagent from 'superagent';
  4. import LeeAnalyzer from './leeAnalyzer';
  /**
   * Contract between the crawler and a page-specific parsing strategy.
   *
   * The crawler downloads the page and delegates to an implementation of this
   * interface to decide what ends up in the output file.
   */
  export interface Analyzer {
    /**
     * Turns the fetched page into the text that will be written to disk.
     *
     * @param html - Raw HTML of the fetched page.
     * @param filePath - Path of the output file, so an implementation can take
     *   previously stored content into account.
     * @returns The complete content to store in the output file.
     */
    analyze: (html: string, filePath: string) => string;
  }
  /**
   * Downloads one page, hands its HTML to an `Analyzer` and stores the
   * analyzer's output in `../data/course.json` (resolved relative to the
   * directory of the running file).
   *
   * The crawl is started from the constructor, so creating an instance is
   * enough to run it.
   */
  class Crowller {
    private filePath = path.resolve(__dirname, '../data/course.json');

    /**
     * @param url - Address of the page to download.
     * @param analyzer - Strategy that converts the page into file content.
     */
    constructor(private url: string, private analyzer: Analyzer) {
      // A constructor cannot be awaited, so a failed request or write has to be
      // handled here; otherwise it surfaces as an unhandled promise rejection.
      this.initSpiderProcess().catch((error: unknown) => {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Failed to crawl ${this.url}: ${reason}`);
        process.exitCode = 1;
      });
    }

    /** Requests `url` and resolves with the response body as text. */
    async getRawHtml(): Promise<string> {
      const result = await superagent.get(this.url);
      return result.text;
    }

    /**
     * Writes `content` to the output file.
     *
     * The target directory is created first because `writeFileSync` throws
     * when it does not exist.
     */
    writeFile(content: string): void {
      fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
      fs.writeFileSync(this.filePath, content);
    }

    /** Runs one full crawl: download, analyze, persist. */
    async initSpiderProcess(): Promise<void> {
      const html = await this.getRawHtml();
      const fileContent = this.analyzer.analyze(html, this.filePath);
      this.writeFile(fileContent);
    }
  }
  // Entry point: build the demo URL and start a crawl.
  const secret = 'secretKey';
  // Encode the value so a secret containing reserved characters cannot break the query string.
  const url = `http://www.dell-lee.com/typescript/demo.html?secret=${encodeURIComponent(secret)}`;
  // LeeAnalyzer returns the HTML unchanged, so with this analyzer the raw page
  // is what gets written to the output file.
  const analyzer = new LeeAnalyzer();
  // Constructing the crawler starts the download immediately.
  new Crowller(url, analyzer);

dellAnalyzer.ts

  1. import fs from 'fs';
  2. import cheerio from 'cheerio';
  3. import { Analyzer } from './crowller';
  /** One course entry extracted from the page. */
  interface Course {
    /** Text of the first `.course-desc` element of the course item. */
    title: string;
    /** Number parsed from the second `.course-desc` element; `NaN` when it cannot be parsed. */
    count: number;
  }

  /** Everything extracted by a single crawl. */
  interface CourseResult {
    /** Moment of the crawl, in milliseconds since the Unix epoch. */
    time: number;
    /** Courses found on the page, in document order. */
    data: Course[];
  }

  /** Shape of the output file: crawl timestamp mapped to the courses seen at that time. */
  interface Content {
    [propName: number]: Course[];
  }
  /**
   * Extracts the integer that follows the first colon in `text`.
   *
   * Both the ASCII colon and the full-width colon (U+FF1A) are accepted as the
   * separator; splitting on the ASCII colon alone yields `NaN` for text that
   * uses the full-width form.
   *
   * @returns The parsed number, or `NaN` when there is no colon or no leading digits after it.
   */
  function parseCourseCount(text: string): number {
    const [, rawCount = ''] = text.split(/[:\uFF1A]/);
    return parseInt(rawCount, 10);
  }

  /**
   * Analyzer that extracts the course list from the page and appends it,
   * keyed by crawl time, to the JSON already stored in the output file.
   */
  export default class DellAnalyzer implements Analyzer {
    /**
     * Collects title and count of every `.course-item` on the page.
     *
     * @param html - Raw HTML of the page.
     * @returns The courses together with the current timestamp.
     */
    private getCourseInfo(html: string): CourseResult {
      const $ = cheerio.load(html);
      const courseInfos: Course[] = [];
      // `each` is used because the loop only collects into `courseInfos`;
      // nothing is mapped to a new selection.
      $('.course-item').each((_index, element) => {
        const descs = $(element).find('.course-desc');
        courseInfos.push({
          title: descs.eq(0).text(),
          count: parseCourseCount(descs.eq(1).text())
        });
      });
      return {
        time: Date.now(),
        data: courseInfos
      };
    }

    /**
     * Merges one crawl result into the content already stored at `filePath`.
     *
     * A missing or empty file is treated as "no previous crawls". Content that
     * is present but not valid JSON still makes `JSON.parse` throw.
     *
     * @param courseInfo - Result of the current crawl.
     * @param filePath - File holding the results of earlier crawls.
     * @returns The previous content plus an entry for `courseInfo.time`.
     */
    generateJsonContent(courseInfo: CourseResult, filePath: string): Content {
      let fileContent: Content = {};
      if (fs.existsSync(filePath)) {
        const stored = fs.readFileSync(filePath, 'utf-8');
        if (stored.trim() !== '') {
          fileContent = JSON.parse(stored);
        }
      }
      fileContent[courseInfo.time] = courseInfo.data;
      return fileContent;
    }

    /**
     * Implements the analyzer contract: parse the page and return the updated
     * file content serialized as JSON.
     *
     * @param html - Raw HTML of the page.
     * @param filePath - Output file whose existing content is extended.
     * @returns JSON text to write back to `filePath`.
     */
    public analyze(html: string, filePath: string): string {
      const courseInfo = this.getCourseInfo(html);
      const fileContent = this.generateJsonContent(courseInfo, filePath);
      return JSON.stringify(fileContent);
    }
  }

leeAnalyzer.ts

  1. import { Analyzer } from './crowller';
  /**
   * Pass-through analyzer: the page is stored exactly as it was downloaded.
   */
  export default class LeeAnalyzer implements Analyzer {
    /**
     * @param html - Raw HTML of the page.
     * @param filePath - Not used; accepted only to satisfy the analyzer contract.
     * @returns `html`, unchanged.
     */
    public analyze(html: string, filePath: string): string {
      return html;
    }
  }

三、执行代码后得到

得到的课程数据(结构与 getCourseInfo 的返回值一致;其中 count 为 NaN,很可能是因为页面文本使用全角冒号“:”,按半角冒号“:”拆分取不到人数)

  1. {
  2. time: 1640504759317,
  3. data: [
  4. { title: 'Vue2.5开发去哪儿网App 从零基础入门到实战项目', count: NaN },
  5. { title: 'React 16.4 开发简书项目 从零基础入门到实战', count: NaN },
  6. { title: 'Vue2.5开发去哪儿网App 从零基础入门到实战项目', count: NaN },
  7. { title: '从基础到实战\t手把手带你掌握新版Webpack4.0', count: NaN },
  8. { title: 'Dell Lee 的微课堂,职业规划答疑解惑,精彩文章与你分享', count: NaN }
  9. ]
  10. }

相关文章