Refactor proxy validation logic to use a unified ProxyValidator class, enhancing code maintainability and adding support for retry mechanisms and logging options.
This commit is contained in:
		
							parent
							
								
									136070e02f
								
							
						
					
					
						commit
						a31fe3f892
					
				@ -1,6 +1,7 @@
 | 
			
		||||
const axios = require('axios');
 | 
			
		||||
const cheerio = require('cheerio');
 | 
			
		||||
const ProxyModel = require('../database/models/proxy');
 | 
			
		||||
const ProxyValidator = require('./validator');
 | 
			
		||||
 | 
			
		||||
// 全局变量:标记是否有抓取任务正在进行
 | 
			
		||||
let isScrapingInProgress = false;
 | 
			
		||||
@ -17,6 +18,7 @@ class ProxyScraper {
 | 
			
		||||
    ];
 | 
			
		||||
    this.currentProxyIndex = 0;
 | 
			
		||||
    this.localProxies = [];
 | 
			
		||||
    this.validator = new ProxyValidator(); // 使用统一的验证器
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  getRandomUserAgent() {
 | 
			
		||||
@ -63,21 +65,24 @@ class ProxyScraper {
 | 
			
		||||
    };
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // 测试代理是否可用(用于抓取)
 | 
			
		||||
  // 测试代理是否可用(用于抓取)- 使用统一的验证方法
 | 
			
		||||
  async testProxyForScraping(proxyConfig) {
 | 
			
		||||
    try {
 | 
			
		||||
      const response = await axios.get('https://www.baidu.com', {
 | 
			
		||||
        proxy: proxyConfig,
 | 
			
		||||
        timeout: 10000,
 | 
			
		||||
        headers: {
 | 
			
		||||
          'User-Agent': this.getRandomUserAgent()
 | 
			
		||||
        },
 | 
			
		||||
        validateStatus: (status) => status === 200
 | 
			
		||||
      });
 | 
			
		||||
      return response.status === 200 && response.data.includes('百度');
 | 
			
		||||
    } catch (error) {
 | 
			
		||||
    if (!proxyConfig || !proxyConfig.host || !proxyConfig.port) {
 | 
			
		||||
      return false;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    const result = await this.validator.validateProxy(
 | 
			
		||||
      proxyConfig.host,
 | 
			
		||||
      proxyConfig.port,
 | 
			
		||||
      {
 | 
			
		||||
        updateDatabase: false, // 抓取时不需要更新数据库
 | 
			
		||||
        logResult: false, // 静默验证,不打印日志
 | 
			
		||||
        timeout: 10000,
 | 
			
		||||
        userAgent: this.getRandomUserAgent()
 | 
			
		||||
      }
 | 
			
		||||
    );
 | 
			
		||||
    
 | 
			
		||||
    return result.isValid;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // 获取可用的代理配置
 | 
			
		||||
@ -147,87 +152,27 @@ class ProxyScraper {
 | 
			
		||||
    return validProxies;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // 验证单个抓取到的代理
 | 
			
		||||
  // 验证单个抓取到的代理 - 使用统一的验证方法
 | 
			
		||||
  async validateScrapedProxy(proxy, retryCount = 2) {
 | 
			
		||||
    const testUrls = [
 | 
			
		||||
      'https://www.baidu.com',
 | 
			
		||||
    ];
 | 
			
		||||
 | 
			
		||||
    for (let attempt = 1; attempt <= retryCount; attempt++) {
 | 
			
		||||
      for (const testUrl of testUrls) {
 | 
			
		||||
        const startTime = Date.now();
 | 
			
		||||
 | 
			
		||||
        try {
 | 
			
		||||
          const proxyConfig = {
 | 
			
		||||
            host: proxy.ip,
 | 
			
		||||
            port: proxy.port,
 | 
			
		||||
            protocol: 'http'
 | 
			
		||||
          };
 | 
			
		||||
 | 
			
		||||
          const response = await axios.get(testUrl, {
 | 
			
		||||
            proxy: proxyConfig,
 | 
			
		||||
            timeout: 10000, // 3秒超时
 | 
			
		||||
            headers: {
 | 
			
		||||
              'User-Agent': this.getRandomUserAgent(),
 | 
			
		||||
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 | 
			
		||||
              'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
 | 
			
		||||
              'Connection': 'keep-alive'
 | 
			
		||||
            },
 | 
			
		||||
            validateStatus: (status) => status >= 200 && status < 300 // 接受2xx状态码
 | 
			
		||||
          });
 | 
			
		||||
 | 
			
		||||
          const responseTime = Date.now() - startTime;
 | 
			
		||||
          let isValid = false;
 | 
			
		||||
 | 
			
		||||
          // 检查响应内容
 | 
			
		||||
          if (response.status === 200) {
 | 
			
		||||
            if (testUrl.includes('baidu.com')) {
 | 
			
		||||
              isValid = response.data.includes('百度');
 | 
			
		||||
            } else if (testUrl.includes('httpbin.org')) {
 | 
			
		||||
              isValid = response.data.includes('origin');
 | 
			
		||||
            } else if (testUrl.includes('google.com')) {
 | 
			
		||||
              isValid = response.data.includes('google');
 | 
			
		||||
            } else {
 | 
			
		||||
              isValid = true; // 对于其他URL,只要能连接就认为有效
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if (isValid) {
 | 
			
		||||
            console.log(`✓ 代理验证通过: ${proxy.ip}:${proxy.port} - ${testUrl} - ${responseTime}ms`);
 | 
			
		||||
            return {
 | 
			
		||||
              proxy: proxy,
 | 
			
		||||
              isValid: true,
 | 
			
		||||
              responseTime: responseTime,
 | 
			
		||||
              error: null,
 | 
			
		||||
              testUrl: testUrl
 | 
			
		||||
            };
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
        } catch (error) {
 | 
			
		||||
          const responseTime = Date.now() - startTime;
 | 
			
		||||
          // 如果是最后一次尝试,返回失败
 | 
			
		||||
          if (attempt === retryCount && testUrl === testUrls[testUrls.length - 1]) {
 | 
			
		||||
            return {
 | 
			
		||||
              proxy: proxy,
 | 
			
		||||
              isValid: false,
 | 
			
		||||
              responseTime: responseTime,
 | 
			
		||||
              error: error.message,
 | 
			
		||||
              testUrl: testUrl
 | 
			
		||||
            };
 | 
			
		||||
          }
 | 
			
		||||
          // 否则继续尝试下一个URL或重试
 | 
			
		||||
          await this.sleep(500);
 | 
			
		||||
        }
 | 
			
		||||
    const result = await this.validator.validateProxy(
 | 
			
		||||
      proxy.ip,
 | 
			
		||||
      proxy.port,
 | 
			
		||||
      {
 | 
			
		||||
        updateDatabase: false, // 抓取时不需要更新数据库(会在保存时统一更新)
 | 
			
		||||
        logResult: true, // 打印验证日志
 | 
			
		||||
        retryCount: retryCount, // 支持重试
 | 
			
		||||
        retryDelay: 500,
 | 
			
		||||
        userAgent: this.getRandomUserAgent()
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    // 所有尝试都失败
 | 
			
		||||
    // 转换为 scraper 期望的格式
 | 
			
		||||
    return {
 | 
			
		||||
      proxy: proxy,
 | 
			
		||||
      isValid: false,
 | 
			
		||||
      responseTime: 0,
 | 
			
		||||
      error: 'All validation attempts failed',
 | 
			
		||||
      testUrl: null
 | 
			
		||||
      isValid: result.isValid,
 | 
			
		||||
      responseTime: result.responseTime,
 | 
			
		||||
      error: result.error,
 | 
			
		||||
      testUrl: result.testUrl
 | 
			
		||||
    };
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -4,68 +4,165 @@ const ProxyModel = require('../database/models/proxy');
 | 
			
		||||
class ProxyValidator {
 | 
			
		||||
  constructor() {
 | 
			
		||||
    this.testUrl = 'https://www.baidu.com';
 | 
			
		||||
    this.timeout = 10000; // 3秒超时
 | 
			
		||||
    this.timeout = 10000; // 10秒超时
 | 
			
		||||
    this.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  async validateProxy(ip, port) {
 | 
			
		||||
  /**
 | 
			
		||||
   * 核心统一验证代理方法 - 所有验证代理的代码都统一使用此方法
 | 
			
		||||
   * @param {string} ip - 代理IP地址
 | 
			
		||||
   * @param {number|string} port - 代理端口
 | 
			
		||||
   * @param {object} options - 可选配置参数
 | 
			
		||||
   * @param {string} options.testUrl - 测试URL,默认为百度
 | 
			
		||||
   * @param {number} options.timeout - 超时时间(毫秒),默认10000
 | 
			
		||||
   * @param {string} options.userAgent - User-Agent,默认使用类中定义的
 | 
			
		||||
   * @param {boolean} options.updateDatabase - 是否更新数据库,默认true
 | 
			
		||||
   * @param {boolean} options.logResult - 是否打印日志,默认true
 | 
			
		||||
   * @param {number} options.retryCount - 重试次数,默认0(不重试)
 | 
			
		||||
   * @param {number} options.retryDelay - 重试延迟(毫秒),默认500
 | 
			
		||||
   * @returns {Promise<object>} 验证结果 {ip, port, isValid, responseTime, error, testUrl}
 | 
			
		||||
   */
 | 
			
		||||
  async validateProxy(ip, port, options = {}) {
 | 
			
		||||
    const {
 | 
			
		||||
      testUrl = this.testUrl,
 | 
			
		||||
      timeout = this.timeout,
 | 
			
		||||
      userAgent = this.userAgent,
 | 
			
		||||
      updateDatabase = true,
 | 
			
		||||
      logResult = true,
 | 
			
		||||
      retryCount = 0,
 | 
			
		||||
      retryDelay = 500
 | 
			
		||||
    } = options;
 | 
			
		||||
 | 
			
		||||
    const startTime = Date.now();
 | 
			
		||||
    const proxy = {
 | 
			
		||||
      host: ip,
 | 
			
		||||
      port: port,
 | 
			
		||||
      port: parseInt(port), // 确保端口是数字
 | 
			
		||||
      protocol: 'http'
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    console.log(`正在验证代理 ${ip}:${port}`);
 | 
			
		||||
    if (logResult) {
 | 
			
		||||
      console.log(`正在验证代理 ${ip}:${port}`);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    try {
 | 
			
		||||
      const response = await axios.get(this.testUrl, {
 | 
			
		||||
        proxy: proxy,
 | 
			
		||||
        timeout: this.timeout,
 | 
			
		||||
        headers: {
 | 
			
		||||
          'User-Agent': this.userAgent,
 | 
			
		||||
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 | 
			
		||||
          'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
 | 
			
		||||
          'Accept-Encoding': 'gzip, deflate',
 | 
			
		||||
          'Connection': 'keep-alive'
 | 
			
		||||
        },
 | 
			
		||||
        validateStatus: (status) => status === 200
 | 
			
		||||
      });
 | 
			
		||||
    // 支持重试机制
 | 
			
		||||
    let lastError = null;
 | 
			
		||||
    let lastResult = null;
 | 
			
		||||
 | 
			
		||||
      const responseTime = Date.now() - startTime;
 | 
			
		||||
      const isValid = response.status === 200 && response.data.includes('百度');
 | 
			
		||||
 | 
			
		||||
      if (isValid) {
 | 
			
		||||
        console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`);
 | 
			
		||||
      } else {
 | 
			
		||||
        console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`);
 | 
			
		||||
    for (let attempt = 0; attempt <= retryCount; attempt++) {
 | 
			
		||||
      if (attempt > 0 && logResult) {
 | 
			
		||||
        console.log(`代理 ${ip}:${port} 第 ${attempt + 1} 次重试验证...`);
 | 
			
		||||
        await this.sleep(retryDelay);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // 更新数据库中的验证结果
 | 
			
		||||
      await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime);
 | 
			
		||||
      try {
 | 
			
		||||
        const response = await axios.get(testUrl, {
 | 
			
		||||
          proxy: proxy,
 | 
			
		||||
          timeout: timeout,
 | 
			
		||||
          headers: {
 | 
			
		||||
            'User-Agent': userAgent,
 | 
			
		||||
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 | 
			
		||||
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
 | 
			
		||||
            'Accept-Encoding': 'gzip, deflate',
 | 
			
		||||
            'Connection': 'keep-alive'
 | 
			
		||||
          },
 | 
			
		||||
          validateStatus: (status) => status >= 200 && status < 300
 | 
			
		||||
        });
 | 
			
		||||
 | 
			
		||||
      return {
 | 
			
		||||
        ip: ip,
 | 
			
		||||
        port: port,
 | 
			
		||||
        isValid: isValid,
 | 
			
		||||
        responseTime: responseTime,
 | 
			
		||||
        error: null
 | 
			
		||||
      };
 | 
			
		||||
    } catch (error) {
 | 
			
		||||
      const responseTime = Date.now() - startTime;
 | 
			
		||||
      console.log(`✗ 代理 ${ip}:${port} 验证失败:`, error.message);
 | 
			
		||||
        const responseTime = Date.now() - startTime;
 | 
			
		||||
        let isValid = false;
 | 
			
		||||
 | 
			
		||||
      // 更新数据库中的验证结果
 | 
			
		||||
      await ProxyModel.updateValidity(ip, port, 0, responseTime);
 | 
			
		||||
        // 检查响应内容 - 根据不同的测试URL使用不同的验证逻辑
 | 
			
		||||
        if (response.status >= 200 && response.status < 300) {
 | 
			
		||||
          if (testUrl.includes('baidu.com')) {
 | 
			
		||||
            isValid = response.data && response.data.includes('百度');
 | 
			
		||||
          } else if (testUrl.includes('httpbin.org')) {
 | 
			
		||||
            isValid = response.data && (response.data.includes('origin') || response.data.includes('ip'));
 | 
			
		||||
          } else if (testUrl.includes('google.com')) {
 | 
			
		||||
            isValid = response.data && response.data.toLowerCase().includes('google');
 | 
			
		||||
          } else {
 | 
			
		||||
            // 对于其他URL,只要能连接就认为有效
 | 
			
		||||
            isValid = true;
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      return {
 | 
			
		||||
        ip: ip,
 | 
			
		||||
        port: port,
 | 
			
		||||
        isValid: false,
 | 
			
		||||
        responseTime: responseTime,
 | 
			
		||||
        error: error.message
 | 
			
		||||
      };
 | 
			
		||||
        if (logResult) {
 | 
			
		||||
          if (isValid) {
 | 
			
		||||
            console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`);
 | 
			
		||||
          } else {
 | 
			
		||||
            console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`);
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        const result = {
 | 
			
		||||
          ip: ip,
 | 
			
		||||
          port: parseInt(port),
 | 
			
		||||
          isValid: isValid,
 | 
			
		||||
          responseTime: responseTime,
 | 
			
		||||
          error: null,
 | 
			
		||||
          testUrl: testUrl
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        // 更新数据库中的验证结果(如果需要)
 | 
			
		||||
        if (updateDatabase) {
 | 
			
		||||
          try {
 | 
			
		||||
            await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime);
 | 
			
		||||
          } catch (dbError) {
 | 
			
		||||
            // 如果代理不在数据库中,忽略更新错误
 | 
			
		||||
            if (!dbError.message.includes('not found')) {
 | 
			
		||||
              console.warn(`更新数据库失败: ${dbError.message}`);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        return result;
 | 
			
		||||
 | 
			
		||||
      } catch (error) {
 | 
			
		||||
        const responseTime = Date.now() - startTime;
 | 
			
		||||
        lastError = error;
 | 
			
		||||
        lastResult = {
 | 
			
		||||
          ip: ip,
 | 
			
		||||
          port: parseInt(port),
 | 
			
		||||
          isValid: false,
 | 
			
		||||
          responseTime: responseTime,
 | 
			
		||||
          error: error.message,
 | 
			
		||||
          testUrl: testUrl
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        // 如果不是最后一次尝试,继续重试
 | 
			
		||||
        if (attempt < retryCount) {
 | 
			
		||||
          continue;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // 最后一次尝试失败
 | 
			
		||||
        if (logResult) {
 | 
			
		||||
          console.log(`✗ 代理 ${ip}:${port} 验证失败: ${error.message}`);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // 更新数据库中的验证结果(如果需要)
 | 
			
		||||
        if (updateDatabase) {
 | 
			
		||||
          try {
 | 
			
		||||
            await ProxyModel.updateValidity(ip, port, 0, responseTime);
 | 
			
		||||
          } catch (dbError) {
 | 
			
		||||
            // 如果代理不在数据库中,忽略更新错误
 | 
			
		||||
            if (!dbError.message.includes('not found')) {
 | 
			
		||||
              console.warn(`更新数据库失败: ${dbError.message}`);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        return lastResult;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // 所有重试都失败了
 | 
			
		||||
    return lastResult || {
 | 
			
		||||
      ip: ip,
 | 
			
		||||
      port: parseInt(port),
 | 
			
		||||
      isValid: false,
 | 
			
		||||
      responseTime: Date.now() - startTime,
 | 
			
		||||
      error: lastError ? lastError.message : '验证失败',
 | 
			
		||||
      testUrl: testUrl
 | 
			
		||||
    };
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  async validateSingleProxy(ip, port) {
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user