HttpClient技术入门 网络爬虫通常使用HTTP协议访问互联网,而HttpClient正是一种HTTP协议的客户端技术,因此被运用到了爬虫技术中。只需在Maven中引入HttpClient依赖即可使用。
简单示例 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 public static void main (String[] args) { CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet("https://www.baidu.com" ); CloseableHttpResponse httpResponse = null ; try { httpResponse = httpClient.execute(httpGet); } catch (IOException e) { e.printStackTrace(); } if (httpResponse.getStatusLine().getStatusCode() == 200 ){ HttpEntity httpEntity = httpResponse.getEntity(); try { String context = EntityUtils.toString(httpEntity,"utf-8" ); System.out.println(context); } catch (IOException e) { e.printStackTrace(); } } }
使用步骤 1. 构建HTTP请求 1 2 3 4 5 6 7 8 9 10 11 12 13 HttpGet httpGet = new HttpGet("https://www.baidu.com" ); URI uri = new URIBuilder().setScheme("http" ) .setHost("https://www.baidu.com" ) .setPath("/s" ) .setParameter("ie" ,"utf-8" ) .setParameter("f" ,"8" ) .setParameter("rsv_bp" ,"1" ) .setParameter("tn" ,"80035161_2_dg" ) .build(); HttpGet httpGet = new HttpGet(uri);
// Section: 2. Add message headers (添加消息头)

// Build an in-memory HTTP/1.1 "200 OK" response to demonstrate the header API.
final String cookieHeaderName = "Set-Cookie";
HttpResponse response = new BasicHttpResponse(
        HttpVersion.HTTP_1_1,
        HttpStatus.SC_OK,
        "OK");

// The same header name is added twice, so the message holds two Set-Cookie headers.
response.addHeader(cookieHeaderName, "c1=a; path=/; domain=localhost");
response.addHeader(cookieHeaderName, "c2=b; path=\"/\", c3=c; domain=\"localhost\"");

// Look the headers up again: the first one, the last one, and all of them.
Header h1 = response.getFirstHeader(cookieHeaderName);
Header h2 = response.getLastHeader(cookieHeaderName);
Header[] hs = response.getHeaders(cookieHeaderName);
// Section: 3. Create HTTP entities (生成HTTP实体)

// Entity backed by an in-memory string, sent as UTF-8 plain text.
StringEntity myEntity = new StringEntity(
        "important message",
        ContentType.create("text/plain", "UTF-8"));

// Entity backed by a file on disk. ContentType is a class, so its static
// factory is called as ContentType.create (capital C).
File file = new File("onefile.txt");
FileEntity fileEntity = new FileEntity(file, ContentType.create("text/plain", "UTF-8"));

// Entity carrying URL-encoded form fields (name/value pairs).
List<NameValuePair> formparam = new ArrayList<NameValuePair>();
formparam.add(new BasicNameValuePair("param1", "value1"));
formparam.add(new BasicNameValuePair("param2", "value2"));
// Distinct variable name so it does not clash with fileEntity in the same scope.
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(formparam, Consts.UTF_8);
4. 配置请求信息 1 2 3 4 5 6 7 8 RequestConfig requestConfig = RequestConfig.custom() .setConnectTimeout(1000 ) .setConnectionRequestTimeout(500 ) .setSocketTimeout(10 *1000 ) .build(); httpGet.setConfig(requestConfig);
连接池的应用 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 public static void main (String[] args) { PoolingHttpClientConnectionManager poolingHttpClientConnectionManager = new PoolingHttpClientConnectionManager(); doGet(poolingHttpClientConnectionManager,"https://www.baidu.com/s" ,"wt" ,"httpclient" ); } private static String doGet (PoolingHttpClientConnectionManager poolingHttpClientConnectionManager,String url,String...param) { CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(poolingHttpClientConnectionManager).build(); URIBuilder uriBuilder; HttpGet httpGet = null ; try { uriBuilder = new URIBuilder(url); for (int i=0 ;(i+1 )<param.length;i+=2 ) uriBuilder.setParameter(param[i],param[i+1 ]); httpGet = new HttpGet(uriBuilder.build()); } catch (URISyntaxException e) { e.printStackTrace(); } CloseableHttpResponse httpResponse = null ; try { httpResponse = httpClient.execute(httpGet); if (httpResponse.getStatusLine().getStatusCode() == 200 ){ HttpEntity httpEntity = httpResponse.getEntity(); String context = EntityUtils.toString(httpEntity,"utf-8" ); System.out.println(context); return context; } } catch (IOException e) { e.printStackTrace(); } finally { try { httpResponse.close(); } catch (IOException e) { e.printStackTrace(); } } return null ; }
Anything that can go right will go right