Refactor proxy validation logic to use a unified ProxyValidator class, enhancing code maintainability and adding support for retry mechanisms and logging options.

This commit is contained in:
theluyuan 2025-10-31 08:45:16 +08:00
parent 136070e02f
commit a31fe3f892
2 changed files with 175 additions and 133 deletions

View File

@ -1,6 +1,7 @@
const axios = require('axios');
const cheerio = require('cheerio');
const ProxyModel = require('../database/models/proxy');
const ProxyValidator = require('./validator');
// Module-level flag: marks whether a scraping task is currently in progress.
let isScrapingInProgress = false;
@ -17,6 +18,7 @@ class ProxyScraper {
];
this.currentProxyIndex = 0;
this.localProxies = [];
this.validator = new ProxyValidator(); // 使用统一的验证器
}
getRandomUserAgent() {
@ -63,21 +65,24 @@ class ProxyScraper {
};
}
// Test whether a proxy is usable for scraping, via the shared ProxyValidator.
async testProxyForScraping(proxyConfig) {
try {
const response = await axios.get('https://www.baidu.com', {
proxy: proxyConfig,
timeout: 10000,
headers: {
'User-Agent': this.getRandomUserAgent()
},
validateStatus: (status) => status === 200
});
return response.status === 200 && response.data.includes('百度');
} catch (error) {
if (!proxyConfig || !proxyConfig.host || !proxyConfig.port) {
return false;
}
const result = await this.validator.validateProxy(
proxyConfig.host,
proxyConfig.port,
{
updateDatabase: false, // 抓取时不需要更新数据库
logResult: false, // 静默验证,不打印日志
timeout: 10000,
userAgent: this.getRandomUserAgent()
}
);
return result.isValid;
}
// 获取可用的代理配置
@ -147,87 +152,27 @@ class ProxyScraper {
return validProxies;
}
// Validate a single scraped proxy, via the shared ProxyValidator.
async validateScrapedProxy(proxy, retryCount = 2) {
const testUrls = [
'https://www.baidu.com',
];
for (let attempt = 1; attempt <= retryCount; attempt++) {
for (const testUrl of testUrls) {
const startTime = Date.now();
try {
const proxyConfig = {
host: proxy.ip,
port: proxy.port,
protocol: 'http'
};
const response = await axios.get(testUrl, {
proxy: proxyConfig,
timeout: 10000, // 3秒超时
headers: {
'User-Agent': this.getRandomUserAgent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Connection': 'keep-alive'
},
validateStatus: (status) => status >= 200 && status < 300 // 接受2xx状态码
});
const responseTime = Date.now() - startTime;
let isValid = false;
// 检查响应内容
if (response.status === 200) {
if (testUrl.includes('baidu.com')) {
isValid = response.data.includes('百度');
} else if (testUrl.includes('httpbin.org')) {
isValid = response.data.includes('origin');
} else if (testUrl.includes('google.com')) {
isValid = response.data.includes('google');
} else {
isValid = true; // 对于其他URL只要能连接就认为有效
}
}
if (isValid) {
console.log(`✓ 代理验证通过: ${proxy.ip}:${proxy.port} - ${testUrl} - ${responseTime}ms`);
return {
proxy: proxy,
isValid: true,
responseTime: responseTime,
error: null,
testUrl: testUrl
};
}
} catch (error) {
const responseTime = Date.now() - startTime;
// 如果是最后一次尝试,返回失败
if (attempt === retryCount && testUrl === testUrls[testUrls.length - 1]) {
return {
proxy: proxy,
isValid: false,
responseTime: responseTime,
error: error.message,
testUrl: testUrl
};
}
// 否则继续尝试下一个URL或重试
await this.sleep(500);
}
const result = await this.validator.validateProxy(
proxy.ip,
proxy.port,
{
updateDatabase: false, // 抓取时不需要更新数据库(会在保存时统一更新)
logResult: true, // 打印验证日志
retryCount: retryCount, // 支持重试
retryDelay: 500,
userAgent: this.getRandomUserAgent()
}
}
);
// 所有尝试都失败
// 转换为 scraper 期望的格式
return {
proxy: proxy,
isValid: false,
responseTime: 0,
error: 'All validation attempts failed',
testUrl: null
isValid: result.isValid,
responseTime: result.responseTime,
error: result.error,
testUrl: result.testUrl
};
}

View File

@ -4,68 +4,165 @@ const ProxyModel = require('../database/models/proxy');
class ProxyValidator {
constructor() {
this.testUrl = 'https://www.baidu.com';
this.timeout = 10000; // 3秒超时
this.timeout = 10000; // 10秒超时
this.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
}
async validateProxy(ip, port) {
/**
* 核心统一验证代理方法 - 所有验证代理的代码都统一使用此方法
* @param {string} ip - 代理IP地址
* @param {number|string} port - 代理端口
* @param {object} options - 可选配置参数
* @param {string} options.testUrl - 测试URL默认为百度
* @param {number} options.timeout - 超时时间毫秒默认10000
* @param {string} options.userAgent - User-Agent默认使用类中定义的
* @param {boolean} options.updateDatabase - 是否更新数据库默认true
* @param {boolean} options.logResult - 是否打印日志默认true
* @param {number} options.retryCount - 重试次数默认0不重试
* @param {number} options.retryDelay - 重试延迟毫秒默认500
* @returns {Promise<object>} 验证结果 {ip, port, isValid, responseTime, error, testUrl}
*/
async validateProxy(ip, port, options = {}) {
const {
testUrl = this.testUrl,
timeout = this.timeout,
userAgent = this.userAgent,
updateDatabase = true,
logResult = true,
retryCount = 0,
retryDelay = 500
} = options;
const startTime = Date.now();
const proxy = {
host: ip,
port: port,
port: parseInt(port), // 确保端口是数字
protocol: 'http'
};
console.log(`正在验证代理 ${ip}:${port}`);
if (logResult) {
console.log(`正在验证代理 ${ip}:${port}`);
}
try {
const response = await axios.get(this.testUrl, {
proxy: proxy,
timeout: this.timeout,
headers: {
'User-Agent': this.userAgent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive'
},
validateStatus: (status) => status === 200
});
// 支持重试机制
let lastError = null;
let lastResult = null;
const responseTime = Date.now() - startTime;
const isValid = response.status === 200 && response.data.includes('百度');
if (isValid) {
console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`);
} else {
console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`);
for (let attempt = 0; attempt <= retryCount; attempt++) {
if (attempt > 0 && logResult) {
console.log(`代理 ${ip}:${port}${attempt + 1} 次重试验证...`);
await this.sleep(retryDelay);
}
// 更新数据库中的验证结果
await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime);
try {
const response = await axios.get(testUrl, {
proxy: proxy,
timeout: timeout,
headers: {
'User-Agent': userAgent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive'
},
validateStatus: (status) => status >= 200 && status < 300
});
return {
ip: ip,
port: port,
isValid: isValid,
responseTime: responseTime,
error: null
};
} catch (error) {
const responseTime = Date.now() - startTime;
console.log(`✗ 代理 ${ip}:${port} 验证失败:`, error.message);
const responseTime = Date.now() - startTime;
let isValid = false;
// 更新数据库中的验证结果
await ProxyModel.updateValidity(ip, port, 0, responseTime);
// 检查响应内容 - 根据不同的测试URL使用不同的验证逻辑
if (response.status >= 200 && response.status < 300) {
if (testUrl.includes('baidu.com')) {
isValid = response.data && response.data.includes('百度');
} else if (testUrl.includes('httpbin.org')) {
isValid = response.data && (response.data.includes('origin') || response.data.includes('ip'));
} else if (testUrl.includes('google.com')) {
isValid = response.data && response.data.toLowerCase().includes('google');
} else {
// 对于其他URL只要能连接就认为有效
isValid = true;
}
}
return {
ip: ip,
port: port,
isValid: false,
responseTime: responseTime,
error: error.message
};
if (logResult) {
if (isValid) {
console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`);
} else {
console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`);
}
}
const result = {
ip: ip,
port: parseInt(port),
isValid: isValid,
responseTime: responseTime,
error: null,
testUrl: testUrl
};
// 更新数据库中的验证结果(如果需要)
if (updateDatabase) {
try {
await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime);
} catch (dbError) {
// 如果代理不在数据库中,忽略更新错误
if (!dbError.message.includes('not found')) {
console.warn(`更新数据库失败: ${dbError.message}`);
}
}
}
return result;
} catch (error) {
const responseTime = Date.now() - startTime;
lastError = error;
lastResult = {
ip: ip,
port: parseInt(port),
isValid: false,
responseTime: responseTime,
error: error.message,
testUrl: testUrl
};
// 如果不是最后一次尝试,继续重试
if (attempt < retryCount) {
continue;
}
// 最后一次尝试失败
if (logResult) {
console.log(`✗ 代理 ${ip}:${port} 验证失败: ${error.message}`);
}
// 更新数据库中的验证结果(如果需要)
if (updateDatabase) {
try {
await ProxyModel.updateValidity(ip, port, 0, responseTime);
} catch (dbError) {
// 如果代理不在数据库中,忽略更新错误
if (!dbError.message.includes('not found')) {
console.warn(`更新数据库失败: ${dbError.message}`);
}
}
}
return lastResult;
}
}
// 所有重试都失败了
return lastResult || {
ip: ip,
port: parseInt(port),
isValid: false,
responseTime: Date.now() - startTime,
error: lastError ? lastError.message : '验证失败',
testUrl: testUrl
};
}
async validateSingleProxy(ip, port) {