diff --git a/src/services/scraper.js b/src/services/scraper.js index 67cba4e..eba1212 100644 --- a/src/services/scraper.js +++ b/src/services/scraper.js @@ -1,6 +1,7 @@ const axios = require('axios'); const cheerio = require('cheerio'); const ProxyModel = require('../database/models/proxy'); +const ProxyValidator = require('./validator'); // 全局变量:标记是否有抓取任务正在进行 let isScrapingInProgress = false; @@ -17,6 +18,7 @@ class ProxyScraper { ]; this.currentProxyIndex = 0; this.localProxies = []; + this.validator = new ProxyValidator(); // 使用统一的验证器 } getRandomUserAgent() { @@ -63,21 +65,24 @@ class ProxyScraper { }; } - // 测试代理是否可用(用于抓取) + // 测试代理是否可用(用于抓取)- 使用统一的验证方法 async testProxyForScraping(proxyConfig) { - try { - const response = await axios.get('https://www.baidu.com', { - proxy: proxyConfig, - timeout: 10000, - headers: { - 'User-Agent': this.getRandomUserAgent() - }, - validateStatus: (status) => status === 200 - }); - return response.status === 200 && response.data.includes('百度'); - } catch (error) { + if (!proxyConfig || !proxyConfig.host || !proxyConfig.port) { return false; } + + const result = await this.validator.validateProxy( + proxyConfig.host, + proxyConfig.port, + { + updateDatabase: false, // 抓取时不需要更新数据库 + logResult: false, // 静默验证,不打印日志 + timeout: 10000, + userAgent: this.getRandomUserAgent() + } + ); + + return result.isValid; } // 获取可用的代理配置 @@ -147,87 +152,27 @@ class ProxyScraper { return validProxies; } - // 验证单个抓取到的代理 + // 验证单个抓取到的代理 - 使用统一的验证方法 async validateScrapedProxy(proxy, retryCount = 2) { - const testUrls = [ - 'https://www.baidu.com', - ]; - - for (let attempt = 1; attempt <= retryCount; attempt++) { - for (const testUrl of testUrls) { - const startTime = Date.now(); - - try { - const proxyConfig = { - host: proxy.ip, - port: proxy.port, - protocol: 'http' - }; - - const response = await axios.get(testUrl, { - proxy: proxyConfig, - timeout: 10000, // 3秒超时 - headers: { - 'User-Agent': this.getRandomUserAgent(), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', - 'Connection': 'keep-alive' - }, - validateStatus: (status) => status >= 200 && status < 300 // 接受2xx状态码 - }); - - const responseTime = Date.now() - startTime; - let isValid = false; - - // 检查响应内容 - if (response.status === 200) { - if (testUrl.includes('baidu.com')) { - isValid = response.data.includes('百度'); - } else if (testUrl.includes('httpbin.org')) { - isValid = response.data.includes('origin'); - } else if (testUrl.includes('google.com')) { - isValid = response.data.includes('google'); - } else { - isValid = true; // 对于其他URL,只要能连接就认为有效 - } - } - - if (isValid) { - console.log(`✓ 代理验证通过: ${proxy.ip}:${proxy.port} - ${testUrl} - ${responseTime}ms`); - return { - proxy: proxy, - isValid: true, - responseTime: responseTime, - error: null, - testUrl: testUrl - }; - } - - } catch (error) { - const responseTime = Date.now() - startTime; - // 如果是最后一次尝试,返回失败 - if (attempt === retryCount && testUrl === testUrls[testUrls.length - 1]) { - return { - proxy: proxy, - isValid: false, - responseTime: responseTime, - error: error.message, - testUrl: testUrl - }; - } - // 否则继续尝试下一个URL或重试 - await this.sleep(500); - } + const result = await this.validator.validateProxy( + proxy.ip, + proxy.port, + { + updateDatabase: false, // 抓取时不需要更新数据库(会在保存时统一更新) + logResult: true, // 打印验证日志 + retryCount: retryCount, // 支持重试 + retryDelay: 500, + userAgent: this.getRandomUserAgent() } - } + ); - // 所有尝试都失败 + // 转换为 scraper 期望的格式 return { proxy: proxy, - isValid: false, - responseTime: 0, - error: 'All validation attempts failed', - testUrl: null + isValid: result.isValid, + responseTime: result.responseTime, + error: result.error, + testUrl: result.testUrl }; } diff --git a/src/services/validator.js b/src/services/validator.js index c6ad2c8..a1e7656 100644 --- a/src/services/validator.js +++ b/src/services/validator.js @@ -4,68 +4,165 @@ const ProxyModel = require('../database/models/proxy'); class ProxyValidator { constructor() { this.testUrl = 'https://www.baidu.com'; - this.timeout = 10000; // 3秒超时 + this.timeout = 10000; // 10秒超时 this.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'; } - async validateProxy(ip, port) { + /** + * 核心统一验证代理方法 - 所有验证代理的代码都统一使用此方法 + * @param {string} ip - 代理IP地址 + * @param {number|string} port - 代理端口 + * @param {object} options - 可选配置参数 + * @param {string} options.testUrl - 测试URL,默认为百度 + * @param {number} options.timeout - 超时时间(毫秒),默认10000 + * @param {string} options.userAgent - User-Agent,默认使用类中定义的 + * @param {boolean} options.updateDatabase - 是否更新数据库,默认true + * @param {boolean} options.logResult - 是否打印日志,默认true + * @param {number} options.retryCount - 重试次数,默认0(不重试) + * @param {number} options.retryDelay - 重试延迟(毫秒),默认500 + * @returns {Promise} 验证结果 {ip, port, isValid, responseTime, error, testUrl} + */ + async validateProxy(ip, port, options = {}) { + const { + testUrl = this.testUrl, + timeout = this.timeout, + userAgent = this.userAgent, + updateDatabase = true, + logResult = true, + retryCount = 0, + retryDelay = 500 + } = options; + const startTime = Date.now(); const proxy = { host: ip, - port: port, + port: parseInt(port), // 确保端口是数字 protocol: 'http' }; - console.log(`正在验证代理 ${ip}:${port}`); + if (logResult) { + console.log(`正在验证代理 ${ip}:${port}`); + } - try { - const response = await axios.get(this.testUrl, { - proxy: proxy, - timeout: this.timeout, - headers: { - 'User-Agent': this.userAgent, - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', - 'Accept-Encoding': 'gzip, deflate', - 'Connection': 'keep-alive' - }, - validateStatus: (status) => status === 200 - }); + // 支持重试机制 + let lastError = null; + let lastResult = null; - const responseTime = Date.now() - startTime; - const isValid = response.status === 200 && response.data.includes('百度'); - - if (isValid) { - console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`); - } else { - console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`); + for (let attempt = 0; attempt <= retryCount; attempt++) { + if (attempt > 0 && logResult) { + console.log(`代理 ${ip}:${port} 第 ${attempt + 1} 次重试验证...`); + await this.sleep(retryDelay); } - // 更新数据库中的验证结果 - await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime); + try { + const response = await axios.get(testUrl, { + proxy: proxy, + timeout: timeout, + headers: { + 'User-Agent': userAgent, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive' + }, + validateStatus: (status) => status >= 200 && status < 300 + }); - return { - ip: ip, - port: port, - isValid: isValid, - responseTime: responseTime, - error: null - }; - } catch (error) { - const responseTime = Date.now() - startTime; - console.log(`✗ 代理 ${ip}:${port} 验证失败:`, error.message); + const responseTime = Date.now() - startTime; + let isValid = false; - // 更新数据库中的验证结果 - await ProxyModel.updateValidity(ip, port, 0, responseTime); + // 检查响应内容 - 根据不同的测试URL使用不同的验证逻辑 + if (response.status >= 200 && response.status < 300) { + if (testUrl.includes('baidu.com')) { + isValid = response.data && response.data.includes('百度'); + } else if (testUrl.includes('httpbin.org')) { + isValid = response.data && (response.data.includes('origin') || response.data.includes('ip')); + } else if (testUrl.includes('google.com')) { + isValid = response.data && response.data.toLowerCase().includes('google'); + } else { + // 对于其他URL,只要能连接就认为有效 + isValid = true; + } + } - return { - ip: ip, - port: port, - isValid: false, - responseTime: responseTime, - error: error.message - }; + if (logResult) { + if (isValid) { + console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`); + } else { + console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`); + } + } + + const result = { + ip: ip, + port: parseInt(port), + isValid: isValid, + responseTime: responseTime, + error: null, + testUrl: testUrl + }; + + // 更新数据库中的验证结果(如果需要) + if (updateDatabase) { + try { + await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime); + } catch (dbError) { + // 如果代理不在数据库中,忽略更新错误 + if (!dbError.message.includes('not found')) { + console.warn(`更新数据库失败: ${dbError.message}`); + } + } + } + + return result; + + } catch (error) { + const responseTime = Date.now() - startTime; + lastError = error; + lastResult = { + ip: ip, + port: parseInt(port), + isValid: false, + responseTime: responseTime, + error: error.message, + testUrl: testUrl + }; + + // 如果不是最后一次尝试,继续重试 + if (attempt < retryCount) { + continue; + } + + // 最后一次尝试失败 + if (logResult) { + console.log(`✗ 代理 ${ip}:${port} 验证失败: ${error.message}`); + } + + // 更新数据库中的验证结果(如果需要) + if (updateDatabase) { + try { + await ProxyModel.updateValidity(ip, port, 0, responseTime); + } catch (dbError) { + // 如果代理不在数据库中,忽略更新错误 + if (!dbError.message.includes('not found')) { + console.warn(`更新数据库失败: ${dbError.message}`); + } + } + } + + return lastResult; + } } + + // 所有重试都失败了 + return lastResult || { + ip: ip, + port: parseInt(port), + isValid: false, + responseTime: Date.now() - startTime, + error: lastError ? lastError.message : '验证失败', + testUrl: testUrl + }; } async validateSingleProxy(ip, port) {