跳至主要內容

node爬虫爬取小说(nestjs、IP代理池)

sharebraverynodenestjsmysqltsIP代理池大约 6 分钟

项目使用 nestjs、mysql、superagent 等实现小说的爬取,并写了一个 IP 代理池。

// TODO: 引入 IP 池

// TODO: 控制并发

// TODO: 爬虫伪装

// TODO: 断点续传

// TODO: 自动抓取书籍列表

// TODO: 外部接口

服务基类 BaseService

/**
 * 服务基类
 */
// @Injectable()
export class BaseService<T> {
  constructor(private repository: Repository<T>) {}

  saveOne<TS extends DeepPartial<T>>(
    entities: TS,
    options?: SaveOptions
  ): Promise<TS> {
    return this.repository.save(entities, options);
  }

  async saveMany<TS extends DeepPartial<T>>(
    entities: TS[],
    options?: SaveOptions
  ): Promise<TS[]> {
    return this.repository.save(entities, options);
  }

  async findOne(options?: FindConditions<T>): Promise<T> {
    return this.repository.findOne(options);
  }

  async findMany(options?: FindConditions<T>): Promise<T[]> {
    return this.repository.find(options);
  }

  async findAll(): Promise<T[]> {
    return this.repository.find();
  }

  async removeOne(entity: T, options?: RemoveOptions): Promise<T> {
    return this.repository.remove(entity, options);
  }

  async removeMany(entities: T[], options?: RemoveOptions): Promise<T[]> {
    return this.repository.remove(entities, options);
  }

  async delete(options?: FindConditions<T>): Promise<DeleteResult> {
    return this.repository.delete(options);
  }

  async update(
    conditions: number | FindConditions<T>,
    newValue: QueryDeepPartialEntity<T>
  ): Promise<number> {
    let updateResult = 1;
    await this.repository
      .update(conditions, newValue)
      .catch((e) => (updateResult = 0));
    return updateResult;
  }
}

request.ts

import request = require("request-promise");

class RequestOptions {
  method?: "GET" | "POST" = "GET";
  url: string;
  encoding? = "utf8"; // 编码
  proxy: string; // 代理
  callback?: (body) => any;
}

const service = async function (options: RequestOptions) {
  const { url, method, encoding, proxy, callback } = {
    ...new RequestOptions(),
    ...options,
  };

  return request(
    {
      url: url,
      method,
      proxy,
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
      },
    }
    // function (err, res, body) {
    //   //   body = iconv.decode(body, encoding);
    //   if (err) {
    //     console.log(err);
    //     return err;
    //   } else {
    //     callback(body);
    //     return body;
    //   }
    // },
  );
};

export default service;

crawl.controller.ts

@Controller("Crawl")
@ApiTags("CrawlController")
export class CrawlController {
  constructor(
    private readonly crawlService: CrawlService,
    @InjectRepository(Book)
    private booksRepository: Repository<Book>
  ) {}

  // TODO:  引入IP池
  // TODO:  控制并发
  // TODO:  爬虫伪装
  // TODO:  断点续传
  // TODO:  自动抓取书籍列表
  // TODO:  外部接口

  /**
   *启动爬虫
   *
   * @param {*} chapter
   * @param {*} callback
   * @memberof CrawlController
   */
  @ApiOperation({
    summary: "启动爬虫",
  })
  @Get("startCrawlBook")
  async startCrawlBook() {
    try {
      console.log("[ 开始抓取 ]-54");

      const books = await this.CrawlBook(BOOK_URL_LIST); // 书籍基本信息

      for (let i = 0; i < books.length; i++) {
        const book = books[i];

        // 写入书名
        await fs.writeFileSync(
          WRITE_BASEURL + `${book.name}.txt`,
          book.name + "\r\r\n"
        );

        const chapters = await this.analyticalChapterContent(book);

        book.chapters = chapters;

        book.chaptersJson = JSON.stringify(chapters);
        book.introductionItemsJson = JSON.stringify(book.introductionItems);
        book.chaptersDirectoryJson = JSON.stringify(book.chaptersDirectoryList);
        await this.crawlService.SaveBookAsync(book); // 保存书籍进入数据库
      }

      // this.writeAllFileSync(books); // 一并写入
      console.log("[ 爬虫运行完毕 ]-76");

      return books;
    } catch (error) {
      console.error(error);
      return error;
    }
  }

  /**
   *并发控制
   * @param {Array<string>} bookList
   * @return {*}
   * @memberof CrawlController
   */
  async CrawlBook(bookList: Array<string>): Promise<Book[]> {
    console.log("[ 开始抓取目录 ]-92");
    return new Promise(async (resolve, reject) => {
      const data = [];
      try {
        for (let i = 0; i < bookList.length; i++) {
          const url = bookList[i];
          const book = await this.CrawlBookBaseInfo(url);
          book.url = url;

          data.push(book);
        }

        resolve(data);

        // async.mapLimit(
        //   bookList,
        //   1,
        //   async (url, callback) => {
        //     const book = await this.CrawlBookBaseInfo(url, callback);
        //     book.url = url;
        //     return book;
        //   },
        //   (err, results) => {
        //     resolve(results);
        //   },
        // );
      } catch (error) {
        reject(error);
      }
    });
  }

  /**
   * 获取书籍基本信息、抓取目录
   * @param url
   * @param callback
   * @returns
   */
  async CrawlBookBaseInfo(url: string, callback?): Promise<Book> {
    const book = new Book();

    return new Promise(async (resolve, reject) => {
      try {
        const html = await this.crawlService.GetHtml(url);

        const $ = cheerio.load(html); // 装载页面

        // 书名
        const name = $("#novelName").text();
        book.name = name;

        // 简介
        const introduction = $(".C-Two p");
        book.introduction = introduction
          .text()
          .replace(new RegExp("飞卢小说网", "g"), "");

        introduction.each((i, el) => {
          const p = $(el).text();
          if (!p.includes("飞卢小说网")) {
            book.introductionItems.push(p);
          }
        });

        //章节目录
        const directory = $(".C-Fo-Zuo a");

        directory.each((i, el) => {
          const directory = new Directory();

          const a = $(el).attr("title");

          const url = "https:" + $(el).attr("href");

          directory.title = a;

          directory.url = url;
          if (a) book.chaptersDirectoryList.push(directory);
        });

        resolve(book);
      } catch (error) {
        reject(error);
      }
    });
  }

  /**
   *解析存入章节内容
   *
   * @param {*} chapters
   * @return {*}
   * @memberof CrawlController
   */
  analyticalChapterContent(book: Book): Promise<Chapter[]> {
    console.log("[ 当前抓取书籍 ]-185", book.name);

    const { chaptersDirectoryList } = book;

    return new Promise(async (resolve, reject) => {
      const chapters: Chapter[] = [];
      try {
        const bookInfo = await this.booksRepository.findOne(book.url); // 查询书籍

        for (let i = 0; i < 3; i++) {
          // for (let i = 0; i < chaptersDirectoryList.length; i++) {
          const item = chaptersDirectoryList[i];

          if (
            bookInfo?.chaptersJson &&
            (JSON.parse(JSON.stringify(bookInfo.chaptersJson)) as Chapter[])
              .map((o) => o.url)
              .includes(item.url)
          ) {
            console.log(item.title, "-章节已存在");

            continue; // 如果数据库内存在 则跳过
          }

          const chapter = new Chapter();

          console.log(" [  正在抓取 ]-198", item.title);

          const html = await this.crawlService.GetHtml(item.url);

          const $ = cheerio.load(html); // 装载页面

          chapter.url = item.url;

          // 章节名称
          const name = $("#novelName").text();
          const title = $(".c_l_title h1").text().replace(name, "").trim();
          chapter.title = title;

          const content = $(".noveContent");
          chapter.content = content.text(); // 存入完整字符串内容

          // 存入段落数组
          content.find("p").each((i, el) => {
            const p = $(el).text();

            chapter.contents.push(p);
          });

          chapters.push(chapter);

          chapter.contentsJson = JSON.stringify(chapter.contents);
          await this.crawlService.SaveChapterAsync(chapter); // 保存章节进入数据库

          //写入章节
          await this.crawlService.SaveChapter(
            chapter,
            WRITE_BASEURL + `${book.name}.txt`
          );
        }

        resolve(chapters);
      } catch (error) {
        reject(error);
      }
    });
  }
}

crawl.service.ts

import { Injectable } from "@nestjs/common";
import { Chapter } from "./chapter.entity";
import fs = require("fs");
import { delay, random } from "src/utils";
import request = require("superagent");

import superagentProxy = require("superagent-proxy");
superagentProxy(request);

import charset = require("superagent-charset");
charset(request);

import { Book } from "./book.entity";
import { InjectRepository } from "@nestjs/typeorm";
import { Repository } from "typeorm";

import { Proxy } from "src/proxy/proxy.entity";

@Injectable()
export class CrawlService {
  constructor(
    @InjectRepository(Book)
    private booksRepository: Repository<Book>,
    @InjectRepository(Chapter)
    private chaptersRepository: Repository<Chapter>,
    @InjectRepository(Proxy)
    private proxyRepository: Repository<Proxy>
  ) {}

  private time = 0;

  proxy = null;

  timer = null;

  timeoutNumber = 0;

  /**
   *获取html页面
   *
   * @param {string} requestUrl
   * @param {string} [requestType]
   * @return {*}  {Promise<any>}
   * @memberof CrawlController
   */
  async GetHtml(requestUrl: string): Promise<any> {
    // const ms = random(500, 2100);
    // await delay(ms); // 控制爬虫速度

    // if (this.timer) clearInterval(this.timer);

    // if (this.time === 0) {
    //   let randomIps: Proxy[] = await this.proxyRepository.find();
    //   // let randomIps: Proxy[] = await this.proxyRepository.query(
    //   //   'SELECT * FROM proxy ORDER BY  RAND() LIMIT 100;',
    //   // );

    //   // randomIps = randomIps.filter(
    //   //   (o) => o.speed < 500 && o.protocols.includes('http'),
    //   // );
    //   randomIps = randomIps.filter((o) => o.country === 'CN');
    //   this.proxy = `${randomIps[0].protocols[0]}://${randomIps[0].ip}:${randomIps[0].port}`;
    // }

    // this.timer = setInterval(() => {
    //   this.time++;
    //   if (this.time === 1000 * 60 * 30) this.time = 0;
    // }, 1000);

    this.proxy = `http://124.204.33.162:8000`;
    console.log(
      "%c [     this.proxy ]-83",
      "font-size:13px; background:pink; color:#bf2c9f;",
      this.proxy
    );

    return new Promise(async (resolve, reject) => {
      try {
        const result = await (request("GET", requestUrl) as any)
          .charset("gb2312")
          .proxy(this.proxy)
          .set("Referer", "https://b.faloo.com/")
          .buffer(true)
          .set(
            "User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
          )
          .timeout({
            response: 50000, // Wait 5 seconds for the server to start sending,
            deadline: 60000, // but allow 1 minute for the file to finish loading.
          });

        resolve(result.text);
      } catch (err) {
        reject(err);
      }
    });
  }

  /**
   *保存书籍
   *
   * @param {Book} book
   * @memberof CrawlService
   */
  async SaveBookAsync(book: Book) {
    // console.log('正在保存书籍');
    return await this.booksRepository.save(book);
  }

  async SaveChapterAsync(chapter: Chapter) {
    // console.log('正在保存章节');
    return await this.chaptersRepository.save(chapter);
  }

  /**
   *保存章节内容
   *
   * @param {Chapter} chapter
   * @param {string} [writeUrl]
   * @memberof CrawlService
   */
  async SaveChapter(chapter: Chapter, writeUrl?: string) {
    await fs.appendFile(
      writeUrl,
      chapter.title + "\r\n" + chapter.content,
      () => {
        // console.log('写入章节:' + chapter.title);
      }
    );
  }

  /**
   *写入所有书籍
   *
   * @param {Book[]} books
   * @memberof CrawlController
   */
  async writeAllFileSync(books: Book[], writeUrl?: string) {
    for (let i = 0; i < books.length; i++) {
      const book = books[i];
      await fs.writeFileSync(writeUrl, book.name + "\r\r\n"); // 写入书名

      for (let j = 0; j < book.chapters.length; j++) {
        const chapter = book.chapters[j];
        await fs.appendFile(
          writeUrl,
          chapter.title + "\r\n" + chapter.content,
          () => {
            // console.log('写入章节:' + chapter.title);
          }
        );
      }
    }

    console.log("写入完成");
  }

  /**
   *写入单本书籍
   *
   * @param {*} book
   * @memberof CrawlController
   */
  async writeSingleFileSync(book: Book, i, writeUrl?: string) {
    if (i === 1) {
      await fs.writeFileSync(writeUrl, book.name + "\r\r\n"); // 写入书名
    }

    for (let j = 0; j < book.chapters.length; j++) {
      const chapter = book.chapters[j];
      await fs.appendFile(
        writeUrl,
        chapter.title + "\r\n" + chapter.content,
        () => {
          console.log("写入章节:" + chapter.title);
        }
      );
    }
  }
}

IP 代理池

proxy.entity.ts

import { ApiProperty } from "@nestjs/swagger";
import { Column, Entity, PrimaryColumn } from "typeorm";
import { AuditMetadata } from "../entitys/auditMetadata.entity";

@Entity()
export class Proxy extends AuditMetadata {
  @ApiProperty({ description: "IP地址" })
  @PrimaryColumn()
  ip: string;

  @ApiProperty({ description: "端口" })
  @Column()
  port: string;

  // @ApiProperty({ description: '代理位置' })
  // @Column()
  // location: string;

  @ApiProperty({ description: "country" })
  @Column()
  country: string;

  @ApiProperty({ description: "city" })
  @Column()
  city: string;

  @ApiProperty({ description: "isp" })
  @Column()
  isp: string;

  @ApiProperty({ description: "最后检查时间" })
  @Column()
  lastChecked: number;

  @ApiProperty({ description: "速度", nullable: true })
  @Column()
  speed: number;

  @ApiProperty({ description: "匿名等级" })
  @Column()
  anonymityLevel: string;

  @ApiProperty({ description: "匿名协议" })
  @Column({ type: "json", nullable: true })
  protocols: string;

  @ApiProperty({ description: "http | https", nullable: true })
  @Column({ nullable: true })
  type: ProxyType;

  @ApiProperty({ description: "是否可用" })
  @Column({ nullable: true })
  available: boolean;
}

export enum ProxyType {
  http,
  https,
}

proxy.controller.ts

// const REQUEST_URL_LIST = ['https://example.com/'];

@ApiTags("ProxyController")
@Controller("Proxy")
export class ProxyController {
  constructor(private readonly proxyService: ProxyService) {}

  @ApiOperation({
    summary: "抓取代理进入IP池",
  })
  @Get("GetProxyList")
  GetProxyList() {
    return this.proxyService.GetProxyList();
  }

  @ApiOperation({
    summary: "校验代理",
  })
  @Get("CheckProxy")
  async CheckProxy() {
    return await this.proxyService.CheckProxy();
  }
}