Refactor proxy validation logic to use a unified ProxyValidator class, enhancing code maintainability and adding support for retry mechanisms and logging options.
This commit is contained in:
parent
136070e02f
commit
a31fe3f892
@ -1,6 +1,7 @@
|
||||
const axios = require('axios');
|
||||
const cheerio = require('cheerio');
|
||||
const ProxyModel = require('../database/models/proxy');
|
||||
const ProxyValidator = require('./validator');
|
||||
|
||||
// 全局变量:标记是否有抓取任务正在进行
|
||||
let isScrapingInProgress = false;
|
||||
@ -17,6 +18,7 @@ class ProxyScraper {
|
||||
];
|
||||
this.currentProxyIndex = 0;
|
||||
this.localProxies = [];
|
||||
this.validator = new ProxyValidator(); // 使用统一的验证器
|
||||
}
|
||||
|
||||
getRandomUserAgent() {
|
||||
@ -63,21 +65,24 @@ class ProxyScraper {
|
||||
};
|
||||
}
|
||||
|
||||
// 测试代理是否可用(用于抓取)
|
||||
// 测试代理是否可用(用于抓取)- 使用统一的验证方法
|
||||
async testProxyForScraping(proxyConfig) {
|
||||
try {
|
||||
const response = await axios.get('https://www.baidu.com', {
|
||||
proxy: proxyConfig,
|
||||
timeout: 10000,
|
||||
headers: {
|
||||
'User-Agent': this.getRandomUserAgent()
|
||||
},
|
||||
validateStatus: (status) => status === 200
|
||||
});
|
||||
return response.status === 200 && response.data.includes('百度');
|
||||
} catch (error) {
|
||||
if (!proxyConfig || !proxyConfig.host || !proxyConfig.port) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const result = await this.validator.validateProxy(
|
||||
proxyConfig.host,
|
||||
proxyConfig.port,
|
||||
{
|
||||
updateDatabase: false, // 抓取时不需要更新数据库
|
||||
logResult: false, // 静默验证,不打印日志
|
||||
timeout: 10000,
|
||||
userAgent: this.getRandomUserAgent()
|
||||
}
|
||||
);
|
||||
|
||||
return result.isValid;
|
||||
}
|
||||
|
||||
// 获取可用的代理配置
|
||||
@ -147,87 +152,27 @@ class ProxyScraper {
|
||||
return validProxies;
|
||||
}
|
||||
|
||||
// 验证单个抓取到的代理
|
||||
// 验证单个抓取到的代理 - 使用统一的验证方法
|
||||
async validateScrapedProxy(proxy, retryCount = 2) {
|
||||
const testUrls = [
|
||||
'https://www.baidu.com',
|
||||
];
|
||||
|
||||
for (let attempt = 1; attempt <= retryCount; attempt++) {
|
||||
for (const testUrl of testUrls) {
|
||||
const startTime = Date.now();
|
||||
|
||||
try {
|
||||
const proxyConfig = {
|
||||
host: proxy.ip,
|
||||
port: proxy.port,
|
||||
protocol: 'http'
|
||||
};
|
||||
|
||||
const response = await axios.get(testUrl, {
|
||||
proxy: proxyConfig,
|
||||
timeout: 10000, // 3秒超时
|
||||
headers: {
|
||||
'User-Agent': this.getRandomUserAgent(),
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||
'Connection': 'keep-alive'
|
||||
},
|
||||
validateStatus: (status) => status >= 200 && status < 300 // 接受2xx状态码
|
||||
});
|
||||
|
||||
const responseTime = Date.now() - startTime;
|
||||
let isValid = false;
|
||||
|
||||
// 检查响应内容
|
||||
if (response.status === 200) {
|
||||
if (testUrl.includes('baidu.com')) {
|
||||
isValid = response.data.includes('百度');
|
||||
} else if (testUrl.includes('httpbin.org')) {
|
||||
isValid = response.data.includes('origin');
|
||||
} else if (testUrl.includes('google.com')) {
|
||||
isValid = response.data.includes('google');
|
||||
} else {
|
||||
isValid = true; // 对于其他URL,只要能连接就认为有效
|
||||
}
|
||||
}
|
||||
|
||||
if (isValid) {
|
||||
console.log(`✓ 代理验证通过: ${proxy.ip}:${proxy.port} - ${testUrl} - ${responseTime}ms`);
|
||||
return {
|
||||
proxy: proxy,
|
||||
isValid: true,
|
||||
responseTime: responseTime,
|
||||
error: null,
|
||||
testUrl: testUrl
|
||||
};
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
const responseTime = Date.now() - startTime;
|
||||
// 如果是最后一次尝试,返回失败
|
||||
if (attempt === retryCount && testUrl === testUrls[testUrls.length - 1]) {
|
||||
return {
|
||||
proxy: proxy,
|
||||
isValid: false,
|
||||
responseTime: responseTime,
|
||||
error: error.message,
|
||||
testUrl: testUrl
|
||||
};
|
||||
}
|
||||
// 否则继续尝试下一个URL或重试
|
||||
await this.sleep(500);
|
||||
}
|
||||
const result = await this.validator.validateProxy(
|
||||
proxy.ip,
|
||||
proxy.port,
|
||||
{
|
||||
updateDatabase: false, // 抓取时不需要更新数据库(会在保存时统一更新)
|
||||
logResult: true, // 打印验证日志
|
||||
retryCount: retryCount, // 支持重试
|
||||
retryDelay: 500,
|
||||
userAgent: this.getRandomUserAgent()
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// 所有尝试都失败
|
||||
// 转换为 scraper 期望的格式
|
||||
return {
|
||||
proxy: proxy,
|
||||
isValid: false,
|
||||
responseTime: 0,
|
||||
error: 'All validation attempts failed',
|
||||
testUrl: null
|
||||
isValid: result.isValid,
|
||||
responseTime: result.responseTime,
|
||||
error: result.error,
|
||||
testUrl: result.testUrl
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@ -4,68 +4,165 @@ const ProxyModel = require('../database/models/proxy');
|
||||
class ProxyValidator {
|
||||
constructor() {
|
||||
this.testUrl = 'https://www.baidu.com';
|
||||
this.timeout = 10000; // 3秒超时
|
||||
this.timeout = 10000; // 10秒超时
|
||||
this.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
|
||||
}
|
||||
|
||||
async validateProxy(ip, port) {
|
||||
/**
|
||||
* 核心统一验证代理方法 - 所有验证代理的代码都统一使用此方法
|
||||
* @param {string} ip - 代理IP地址
|
||||
* @param {number|string} port - 代理端口
|
||||
* @param {object} options - 可选配置参数
|
||||
* @param {string} options.testUrl - 测试URL,默认为百度
|
||||
* @param {number} options.timeout - 超时时间(毫秒),默认10000
|
||||
* @param {string} options.userAgent - User-Agent,默认使用类中定义的
|
||||
* @param {boolean} options.updateDatabase - 是否更新数据库,默认true
|
||||
* @param {boolean} options.logResult - 是否打印日志,默认true
|
||||
* @param {number} options.retryCount - 重试次数,默认0(不重试)
|
||||
* @param {number} options.retryDelay - 重试延迟(毫秒),默认500
|
||||
* @returns {Promise<object>} 验证结果 {ip, port, isValid, responseTime, error, testUrl}
|
||||
*/
|
||||
async validateProxy(ip, port, options = {}) {
|
||||
const {
|
||||
testUrl = this.testUrl,
|
||||
timeout = this.timeout,
|
||||
userAgent = this.userAgent,
|
||||
updateDatabase = true,
|
||||
logResult = true,
|
||||
retryCount = 0,
|
||||
retryDelay = 500
|
||||
} = options;
|
||||
|
||||
const startTime = Date.now();
|
||||
const proxy = {
|
||||
host: ip,
|
||||
port: port,
|
||||
port: parseInt(port), // 确保端口是数字
|
||||
protocol: 'http'
|
||||
};
|
||||
|
||||
console.log(`正在验证代理 ${ip}:${port}`);
|
||||
if (logResult) {
|
||||
console.log(`正在验证代理 ${ip}:${port}`);
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await axios.get(this.testUrl, {
|
||||
proxy: proxy,
|
||||
timeout: this.timeout,
|
||||
headers: {
|
||||
'User-Agent': this.userAgent,
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
'Connection': 'keep-alive'
|
||||
},
|
||||
validateStatus: (status) => status === 200
|
||||
});
|
||||
// 支持重试机制
|
||||
let lastError = null;
|
||||
let lastResult = null;
|
||||
|
||||
const responseTime = Date.now() - startTime;
|
||||
const isValid = response.status === 200 && response.data.includes('百度');
|
||||
|
||||
if (isValid) {
|
||||
console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`);
|
||||
} else {
|
||||
console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`);
|
||||
for (let attempt = 0; attempt <= retryCount; attempt++) {
|
||||
if (attempt > 0 && logResult) {
|
||||
console.log(`代理 ${ip}:${port} 第 ${attempt + 1} 次重试验证...`);
|
||||
await this.sleep(retryDelay);
|
||||
}
|
||||
|
||||
// 更新数据库中的验证结果
|
||||
await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime);
|
||||
try {
|
||||
const response = await axios.get(testUrl, {
|
||||
proxy: proxy,
|
||||
timeout: timeout,
|
||||
headers: {
|
||||
'User-Agent': userAgent,
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
'Connection': 'keep-alive'
|
||||
},
|
||||
validateStatus: (status) => status >= 200 && status < 300
|
||||
});
|
||||
|
||||
return {
|
||||
ip: ip,
|
||||
port: port,
|
||||
isValid: isValid,
|
||||
responseTime: responseTime,
|
||||
error: null
|
||||
};
|
||||
} catch (error) {
|
||||
const responseTime = Date.now() - startTime;
|
||||
console.log(`✗ 代理 ${ip}:${port} 验证失败:`, error.message);
|
||||
const responseTime = Date.now() - startTime;
|
||||
let isValid = false;
|
||||
|
||||
// 更新数据库中的验证结果
|
||||
await ProxyModel.updateValidity(ip, port, 0, responseTime);
|
||||
// 检查响应内容 - 根据不同的测试URL使用不同的验证逻辑
|
||||
if (response.status >= 200 && response.status < 300) {
|
||||
if (testUrl.includes('baidu.com')) {
|
||||
isValid = response.data && response.data.includes('百度');
|
||||
} else if (testUrl.includes('httpbin.org')) {
|
||||
isValid = response.data && (response.data.includes('origin') || response.data.includes('ip'));
|
||||
} else if (testUrl.includes('google.com')) {
|
||||
isValid = response.data && response.data.toLowerCase().includes('google');
|
||||
} else {
|
||||
// 对于其他URL,只要能连接就认为有效
|
||||
isValid = true;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
ip: ip,
|
||||
port: port,
|
||||
isValid: false,
|
||||
responseTime: responseTime,
|
||||
error: error.message
|
||||
};
|
||||
if (logResult) {
|
||||
if (isValid) {
|
||||
console.log(`✓ 代理 ${ip}:${port} 验证成功,响应时间: ${responseTime}ms`);
|
||||
} else {
|
||||
console.log(`✗ 代理 ${ip}:${port} 验证失败,响应不正确`);
|
||||
}
|
||||
}
|
||||
|
||||
const result = {
|
||||
ip: ip,
|
||||
port: parseInt(port),
|
||||
isValid: isValid,
|
||||
responseTime: responseTime,
|
||||
error: null,
|
||||
testUrl: testUrl
|
||||
};
|
||||
|
||||
// 更新数据库中的验证结果(如果需要)
|
||||
if (updateDatabase) {
|
||||
try {
|
||||
await ProxyModel.updateValidity(ip, port, isValid ? 1 : 0, responseTime);
|
||||
} catch (dbError) {
|
||||
// 如果代理不在数据库中,忽略更新错误
|
||||
if (!dbError.message.includes('not found')) {
|
||||
console.warn(`更新数据库失败: ${dbError.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
} catch (error) {
|
||||
const responseTime = Date.now() - startTime;
|
||||
lastError = error;
|
||||
lastResult = {
|
||||
ip: ip,
|
||||
port: parseInt(port),
|
||||
isValid: false,
|
||||
responseTime: responseTime,
|
||||
error: error.message,
|
||||
testUrl: testUrl
|
||||
};
|
||||
|
||||
// 如果不是最后一次尝试,继续重试
|
||||
if (attempt < retryCount) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 最后一次尝试失败
|
||||
if (logResult) {
|
||||
console.log(`✗ 代理 ${ip}:${port} 验证失败: ${error.message}`);
|
||||
}
|
||||
|
||||
// 更新数据库中的验证结果(如果需要)
|
||||
if (updateDatabase) {
|
||||
try {
|
||||
await ProxyModel.updateValidity(ip, port, 0, responseTime);
|
||||
} catch (dbError) {
|
||||
// 如果代理不在数据库中,忽略更新错误
|
||||
if (!dbError.message.includes('not found')) {
|
||||
console.warn(`更新数据库失败: ${dbError.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return lastResult;
|
||||
}
|
||||
}
|
||||
|
||||
// 所有重试都失败了
|
||||
return lastResult || {
|
||||
ip: ip,
|
||||
port: parseInt(port),
|
||||
isValid: false,
|
||||
responseTime: Date.now() - startTime,
|
||||
error: lastError ? lastError.message : '验证失败',
|
||||
testUrl: testUrl
|
||||
};
|
||||
}
|
||||
|
||||
async validateSingleProxy(ip, port) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user