很多朋友会遇到这样一个问题,使用代理IP访问目标网站,并且已经设置过了user-agent,获取的新IP能够正常访问,不过过一小段时间之后,就会出现大量403,这是咋回事呢。
想着会不会是因为cookie的原因,于是为每个代理IP在首次访问时保存对应的cookie,后续访问时复用该cookie。
发现然并卵,该403还是403……
/*
 * (Part of the surrounding code is omitted.)
 *
 * Keeps one cookie store per target host on the proxy entry: a store is
 * registered the first time a host is seen, and the store registered for the
 * host is then handed to the client that executes the request.
 */
String hostName = httpget.getURI().getHost();
boolean hostAlreadyRegistered = proxyIp.getCookieList().containsKey(hostName);
if (!hostAlreadyRegistered) {
    proxyIp.getCookieList().put(hostName, new BasicCookieStore());
}
CookieStore hostCookieStore = proxyIp.getCookieList().get(hostName);
CloseableHttpResponse response = getHttpClient(hostCookieStore).execute(httpget);
/**
 * Connection pool shared by every client returned from {@link #getHttpClient(CookieStore)}.
 * Created lazily on first use; only touched while holding the class lock taken by
 * the synchronized factory method.
 */
private static PoolingHttpClientConnectionManager sharedConnectionManager;

/**
 * Builds an HTTP client that stores cookies in the given cookie store.
 *
 * <p>All returned clients share a single pooled connection manager. Previously a
 * brand-new pool was created on every call while the client was also flagged as
 * using a shared manager, so pools were never shut down and connections were never
 * reused across calls.
 *
 * <p>If the pool cannot be set up, the error is logged and the client is built
 * without an explicit connection manager (same best-effort behaviour as before);
 * setup is retried on the next call.
 *
 * <p>NOTE(review): the TLS configuration accepts every certificate and skips
 * hostname verification. That is only acceptable for scraping public pages; do not
 * reuse this client for anything that carries credentials.
 *
 * @param cookieStore cookie store the returned client reads from and writes to
 * @return a client with automatic retries disabled
 */
public synchronized static CloseableHttpClient getHttpClient(CookieStore cookieStore){
    if (sharedConnectionManager == null) {
        try {
            sharedConnectionManager = buildConnectionManager();
        } catch (Exception e) {
            logger.error("获取HTTPClient出错,未处理错误!", e);
        }
    }
    return HttpClients.custom()
            .setDefaultCookieStore(cookieStore)
            .setConnectionManager(sharedConnectionManager)
            // The pool outlives individual clients, so a client must not shut it down.
            .setConnectionManagerShared(true)
            .setRetryHandler(new DefaultHttpRequestRetryHandler(0, false))
            .build();
}

/** Creates the pooled connection manager (trust-all TLS, 200 total / 50 per route, 15 s socket timeout). */
private static PoolingHttpClientConnectionManager buildConnectionManager()
        throws java.security.GeneralSecurityException {
    SSLContextBuilder sslContextBuilder = new SSLContextBuilder();
    // Trust strategy that accepts every certificate chain.
    sslContextBuilder.loadTrustMaterial(null, (TrustStrategy) (x509Certificates, s) -> true);
    SSLConnectionSocketFactory sslSocketFactory = new SSLConnectionSocketFactory(
            sslContextBuilder.build(),
            new String[]{"SSLv2Hello", "SSLv3", "TLSv1", "TLSv1.2"},
            null,
            NoopHostnameVerifier.INSTANCE);
    Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", new PlainConnectionSocketFactory())
            .register("https", sslSocketFactory)
            .build();
    PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager(registry);
    manager.setMaxTotal(200); // max connections across all routes
    manager.setDefaultMaxPerRoute(50);
    manager.setDefaultSocketConfig(SocketConfig.custom().setSoTimeout(15 * 1000).build());
    return manager;
}
其实问题特别简单:目标网站的封禁策略针对的是IP+user-agent的组合。于是我在网上收集了一些常见的UA,每次请求随机选取一个,试验后发现问题已解决。
Python的fake_useragent库提供了ua.random属性,可以直接随机获取UA,简单方便;不过Java标准库中似乎没有类似的工具,所以这里手动维护一个UA列表。
/**
 * Pool of browser User-Agent strings; one is picked at random per request.
 *
 * <p>Entries use the spacing real browsers send ("Mozilla/5.0 (Windows NT ...; ...) ..."),
 * and the MSIE 9.0 entry has its closing parenthesis, which was previously missing.
 */
private static final String[] userAgent = {
    // Chrome
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    // Firefox
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
    // Older Chrome builds
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    // Older Firefox builds
    "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    // Safari on Windows
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
    // Internet Explorer
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
};
// Create the GET request and attach a randomly chosen User-Agent header.
HttpGet httpget = new HttpGet(url);
// The bound must be the full array length: assuming `random` is a java.util.Random,
// nextInt(bound) already excludes the bound, so "length - 1" never selected the last entry.
httpget.setHeader("user-agent", userAgent[random.nextInt(userAgent.length)]);