我们代理ip提供的不仅仅是软件

海量ip+api接口+代理服务器=一站式ip代理

首页 > 代理IP资讯 >

使用代理IP后爬虫工作遇到403怎么办

作者:admin 来源:未知 发布时间:2020-05-25

  很多朋友会遇到这样一个问题:使用代理IP访问目标网站,并且已经设置了user-agent,新获取的IP一开始能够正常访问,但过一小段时间之后就会出现大量403,这是怎么回事呢?
  起初猜测是cookie的原因,于是让每个代理IP在第一次访问目标站点时保存对应的cookie,后续访问都带上该cookie。
  发现然并卵,该403还是403……
使用代理IP后爬虫工作遇到403怎么办
  /*
   * (Part of the surrounding code is omitted.)
   *
   * Make sure this proxy IP has a cookie store registered for the target host,
   * then send the request through a client bound to that store.
   */
  String hostName = httpget.getURI().getHost();
  boolean storeMissing = !proxyIp.getCookieList().containsKey(hostName);
  if (storeMissing) {
    // No entry for this host yet on this proxy: register a fresh, empty store.
    proxyIp.getCookieList().put(hostName, new BasicCookieStore());
  }
  CloseableHttpClient client = getHttpClient(proxyIp.getCookieList().get(hostName));
  CloseableHttpResponse response = client.execute(httpget);
  /**
   * Connection pool reused by every client handed out by {@link #getHttpClient(CookieStore)}.
   * Created lazily on first use; only touched while holding the class lock of the
   * synchronized factory method.
   */
  private static PoolingHttpClientConnectionManager sharedConnectionManager;

  /**
   * Returns an HTTP client that stores its cookies in the given cookie store.
   *
   * <p>All returned clients use one shared connection pool. The clients are built with
   * {@code setConnectionManagerShared(true)}, so closing a client does not shut the pool
   * down. Building a new pool on every call (as this method used to do) therefore leaked
   * one pool per call; the pool is now created once and reused.
   *
   * <p>Automatic request retries are disabled.
   *
   * @param cookieStore cookie store the returned client reads from and writes to
   * @return a newly built client backed by the shared connection pool
   */
  public synchronized static CloseableHttpClient getHttpClient(CookieStore cookieStore){
    if(sharedConnectionManager==null){
      // May still be null if construction failed; the failure is logged and the
      // builder then falls back to its own default connection manager, as before.
      sharedConnectionManager=buildConnectionManager();
    }
    return HttpClients.custom()
        .setDefaultCookieStore(cookieStore)
        .setConnectionManager(sharedConnectionManager)
        .setConnectionManagerShared(true)
        .setRetryHandler(new DefaultHttpRequestRetryHandler(0,false))
        .build();
  }

  /**
   * Builds the pooled connection manager: plain sockets for http, and for https an SSL
   * socket factory that accepts every certificate and skips hostname verification.
   *
   * @return the configured connection manager, or {@code null} if it could not be built
   *         (the error is logged)
   */
  private static PoolingHttpClientConnectionManager buildConnectionManager(){
    try{
      SSLContextBuilder builder=new SSLContextBuilder();
      // Trust strategy always answers true: server certificates are NOT validated.
      builder.loadTrustMaterial(null,(TrustStrategy)(x509Certificates,s)->true);
      // NOTE(review): the protocol list includes legacy SSLv2Hello/SSLv3 and omits
      // TLSv1.1/TLSv1.3 — confirm this is intentional for the target sites.
      SSLConnectionSocketFactory sslsf=new SSLConnectionSocketFactory(
          builder.build(),
          new String[]{"SSLv2Hello","SSLv3","TLSv1","TLSv1.2"},
          null,
          NoopHostnameVerifier.INSTANCE);
      Registry<ConnectionSocketFactory>registry=RegistryBuilder.<ConnectionSocketFactory>create()
          .register("http",new PlainConnectionSocketFactory())
          .register("https",sslsf)
          .build();
      PoolingHttpClientConnectionManager cm=new PoolingHttpClientConnectionManager(registry);
      cm.setMaxTotal(200);// max connections across all routes
      cm.setDefaultMaxPerRoute(50);// max connections per route
      cm.setDefaultSocketConfig(SocketConfig.custom().setSoTimeout(15*1000).build());// 15 s socket timeout
      return cm;
    }catch(Exception e){
      logger.error("获取HTTPClient出错,未处理错误!",e);
      return null;
    }
  }
  原因其实很简单:目标网站的封禁策略针对的是IP+user-agent的组合。于是我在网上收集了一些常见的ua,每次请求随机选用一个,试验后发现问题解决了。
  其实python的fake_useragent库中,UserAgent对象的random属性(即ua.random)就可以随机获取ua,简单方便。不过java貌似没有这样现成的东西。
  /**
   * Pool of browser User-Agent strings; one is picked at random for each request.
   *
   * <p>Entries use the separator spacing real browsers send (a space after each
   * {@code ;} and {@code ,} and around the parenthesised groups), and every
   * parenthesised group is closed.
   */
  private static String[] userAgent = {
      "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
      "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
      "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
      "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0",
      "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
      "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
      "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
      "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
      "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
      "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
      // The closing parenthesis was missing from this entry.
      "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
      "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
      "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
  };
  // Build the GET request and attach a randomly chosen User-Agent.
  HttpGet httpget = new HttpGet(url);
  // Random.nextInt(bound) returns a value in [0, bound), so the bound must be the full
  // array length. With length-1 the last entry was never chosen, and a one-element
  // array would throw IllegalArgumentException (bound must be positive).
  httpget.setHeader("user-agent", userAgent[random.nextInt(userAgent.length)]);