From 52267147e624f3a0daef4870ba72f023ef9162a9 Mon Sep 17 00:00:00 2001
From: username@email.com <yzy2002yzy@163.com>
Date: 星期六, 07 十月 2023 12:46:29 +0800
Subject: [PATCH] 修改中国政府采购网爬虫规则
---
zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs | 153 ++++++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 138 insertions(+), 15 deletions(-)
diff --git a/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs b/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs
index 0dce7a2..f836bd6 100644
--- a/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs
+++ b/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs
@@ -3,7 +3,10 @@
using System;
using System.Collections.Generic;
using System.Globalization;
+using System.IO;
+using System.IO.Compression;
using System.Linq;
+using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text;
@@ -171,7 +174,7 @@
//閲囪喘鍏憡
- public static void caigougonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page, string cgfs, string cgfsName)
+ public static async void caigougonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page, string cgfs, string cgfsName)
{
sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A"));
@@ -185,12 +188,36 @@
string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page);
try
{
- using (HttpClient client = new HttpClient())
+ HttpClientHandler handler = new HttpClientHandler();
+ handler.CookieContainer = new CookieContainer();
+ using (HttpClient client = new HttpClient(handler))
{
client.Timeout = TimeSpan.FromSeconds(60);
+ client.DefaultRequestHeaders.Add("Accept", "*/*");
+ client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br");
+ client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn");
//List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>();
HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result;
- var res = response.Content.ReadAsStringAsync().Result;
+ string res = "";
+ if (response.IsSuccessStatusCode)
+ {
+ using (var responseStream = await response.Content.ReadAsStreamAsync())
+ {
+ using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress))
+ {
+ using (var reader = new StreamReader(decompressedStream))
+ {
+ res = await reader.ReadToEndAsync();
+ // Process the decompressed response content
+ }
+ }
+ }
+ }
+ else
+ {
+ // Handle the case where the request failed
+ }
+ //var res = response.Content.ReadAsStringAsync().Result;
var document = parser.ParseDocument(res);
var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault();
var contentList = sssdfsdfsd.QuerySelector("ul");
@@ -542,7 +569,7 @@
//鎰忓悜鍏紑
- public static void yixianggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime,string endTime, int page)
+ public static async void yixianggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime,string endTime, int page)
{
sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A"));
@@ -556,12 +583,36 @@
string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page);
try
{
- using (HttpClient client = new HttpClient())
+ HttpClientHandler handler = new HttpClientHandler();
+ handler.CookieContainer = new CookieContainer();
+ using (HttpClient client = new HttpClient(handler))
{
client.Timeout = TimeSpan.FromSeconds(60);
+ client.DefaultRequestHeaders.Add("Accept", "*/*");
+ client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br");
+ client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn");
//List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>();
HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result;
- var res = response.Content.ReadAsStringAsync().Result;
+ //var res = response.Content.ReadAsStringAsync().Result;
+ string res = "";
+ if (response.IsSuccessStatusCode)
+ {
+ using (var responseStream = await response.Content.ReadAsStreamAsync())
+ {
+ using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress))
+ {
+ using (var reader = new StreamReader(decompressedStream))
+ {
+ res = await reader.ReadToEndAsync();
+ // Process the decompressed response content
+ }
+ }
+ }
+ }
+ else
+ {
+ // Handle the case where the request failed
+ }
var document = parser.ParseDocument(res);
var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault();
var contentList = sssdfsdfsd.QuerySelector("ul");
@@ -857,7 +908,7 @@
//缁撴灉鍏憡
- public static void zhongbiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page)
+ public static async void zhongbiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page)
{
sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A"));
@@ -871,12 +922,36 @@
string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page);
try
{
- using (HttpClient client = new HttpClient())
+ HttpClientHandler handler = new HttpClientHandler();
+ handler.CookieContainer = new CookieContainer();
+ using (HttpClient client = new HttpClient(handler))
{
client.Timeout = TimeSpan.FromSeconds(60);
+ client.DefaultRequestHeaders.Add("Accept", "*/*");
+ client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br");
+ client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn");
//List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>();
HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result;
- var res = response.Content.ReadAsStringAsync().Result;
+ //var res = response.Content.ReadAsStringAsync().Result;
+ string res = "";
+ if (response.IsSuccessStatusCode)
+ {
+ using (var responseStream = await response.Content.ReadAsStreamAsync())
+ {
+ using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress))
+ {
+ using (var reader = new StreamReader(decompressedStream))
+ {
+ res = await reader.ReadToEndAsync();
+ // Process the decompressed response content
+ }
+ }
+ }
+ }
+ else
+ {
+ // Handle the case where the request failed
+ }
var document = parser.ParseDocument(res);
var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault();
var contentList = sssdfsdfsd.QuerySelector("ul");
@@ -1224,7 +1299,7 @@
}
//鏇存鍏憡
- public static void gengzhenggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page)
+ public static async void gengzhenggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page)
{
sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A"));
@@ -1238,12 +1313,36 @@
string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page);
try
{
- using (HttpClient client = new HttpClient())
+ HttpClientHandler handler = new HttpClientHandler();
+ handler.CookieContainer = new CookieContainer();
+ using (HttpClient client = new HttpClient(handler))
{
client.Timeout = TimeSpan.FromSeconds(60);
+ client.DefaultRequestHeaders.Add("Accept", "*/*");
+ client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br");
+ client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn");
//List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>();
HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result;
- var res = response.Content.ReadAsStringAsync().Result;
+ //var res = response.Content.ReadAsStringAsync().Result;
+ string res = "";
+ if (response.IsSuccessStatusCode)
+ {
+ using (var responseStream = await response.Content.ReadAsStreamAsync())
+ {
+ using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress))
+ {
+ using (var reader = new StreamReader(decompressedStream))
+ {
+ res = await reader.ReadToEndAsync();
+ // Process the decompressed response content
+ }
+ }
+ }
+ }
+ else
+ {
+ // Handle the case where the request failed
+ }
var document = parser.ParseDocument(res);
var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault();
var contentList = sssdfsdfsd.QuerySelector("ul");
@@ -1539,7 +1638,7 @@
//搴熸爣鍏憡
- public static void feibiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page)
+ public static async void feibiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page)
{
sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A"));
@@ -1553,12 +1652,36 @@
string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page);
try
{
- using (HttpClient client = new HttpClient())
+ HttpClientHandler handler = new HttpClientHandler();
+ handler.CookieContainer = new CookieContainer();
+ using (HttpClient client = new HttpClient(handler))
{
//List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>();
client.Timeout = TimeSpan.FromSeconds(60);
+ client.DefaultRequestHeaders.Add("Accept", "*/*");
+ client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br");
+ client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn");
HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result;
- var res = response.Content.ReadAsStringAsync().Result;
+ //var res = response.Content.ReadAsStringAsync().Result;
+ string res = "";
+ if (response.IsSuccessStatusCode)
+ {
+ using (var responseStream = await response.Content.ReadAsStreamAsync())
+ {
+ using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress))
+ {
+ using (var reader = new StreamReader(decompressedStream))
+ {
+ res = await reader.ReadToEndAsync();
+ // Process the decompressed response content
+ }
+ }
+ }
+ }
+ else
+ {
+ // Handle the case where the request failed
+ }
var document = parser.ParseDocument(res);
var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault();
var contentList = sssdfsdfsd.QuerySelector("ul");
--
Gitblit v1.9.1