From 52267147e624f3a0daef4870ba72f023ef9162a9 Mon Sep 17 00:00:00 2001 From: username@email.com <yzy2002yzy@163.com> Date: 星期六, 07 十月 2023 12:46:29 +0800 Subject: [PATCH] 修改中国政府采购网爬虫规则 --- zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs | 153 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 files changed, 138 insertions(+), 15 deletions(-) diff --git a/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs b/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs index 0dce7a2..f836bd6 100644 --- a/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs +++ b/zhengcaioa/Crawler/sichuan/CcgpSichuanoperation.cs @@ -3,7 +3,10 @@ using System; using System.Collections.Generic; using System.Globalization; +using System.IO; +using System.IO.Compression; using System.Linq; +using System.Net; using System.Net.Http; using System.Net.Http.Headers; using System.Text; @@ -171,7 +174,7 @@ //采购公告 - public static void caigougonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page, string cgfs, string cgfsName) + public static async void caigougonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page, string cgfs, string cgfsName) { sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A")); @@ -185,12 +188,36 @@ string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page); try { - using (HttpClient client = new HttpClient()) + HttpClientHandler handler = new HttpClientHandler(); + handler.CookieContainer = new CookieContainer(); + using (HttpClient client = new HttpClient(handler)) { client.Timeout = TimeSpan.FromSeconds(60); + client.DefaultRequestHeaders.Add("Accept", "*/*"); + client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br"); + client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn"); //List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>(); HttpResponseMessage response = 
client.GetAsync(sichuanpageurl2).Result; - var res = response.Content.ReadAsStringAsync().Result; + string res = ""; + if (response.IsSuccessStatusCode) + { + using (var responseStream = await response.Content.ReadAsStreamAsync()) + { + using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress)) + { + using (var reader = new StreamReader(decompressedStream)) + { + res = await reader.ReadToEndAsync(); + // 处理解压缩后的响应内容 + } + } + } + } + else + { + // 处理请求失败的情况 + } + //var res = response.Content.ReadAsStringAsync().Result; var document = parser.ParseDocument(res); var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault(); var contentList = sssdfsdfsd.QuerySelector("ul"); @@ -542,7 +569,7 @@ //意向公开 - public static void yixianggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime,string endTime, int page) + public static async void yixianggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime,string endTime, int page) { sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A")); @@ -556,12 +583,36 @@ string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page); try { - using (HttpClient client = new HttpClient()) + HttpClientHandler handler = new HttpClientHandler(); + handler.CookieContainer = new CookieContainer(); + using (HttpClient client = new HttpClient(handler)) { client.Timeout = TimeSpan.FromSeconds(60); + client.DefaultRequestHeaders.Add("Accept", "*/*"); + client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br"); + client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn"); //List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>(); HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result; - var res = response.Content.ReadAsStringAsync().Result; + //var res = response.Content.ReadAsStringAsync().Result; + string res = ""; + 
if (response.IsSuccessStatusCode) + { + using (var responseStream = await response.Content.ReadAsStreamAsync()) + { + using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress)) + { + using (var reader = new StreamReader(decompressedStream)) + { + res = await reader.ReadToEndAsync(); + // 处理解压缩后的响应内容 + } + } + } + } + else + { + // 处理请求失败的情况 + } var document = parser.ParseDocument(res); var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault(); var contentList = sssdfsdfsd.QuerySelector("ul"); @@ -857,7 +908,7 @@ //结果公告 - public static void zhongbiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page) + public static async void zhongbiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page) { sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A")); @@ -871,12 +922,36 @@ string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page); try { - using (HttpClient client = new HttpClient()) + HttpClientHandler handler = new HttpClientHandler(); + handler.CookieContainer = new CookieContainer(); + using (HttpClient client = new HttpClient(handler)) { client.Timeout = TimeSpan.FromSeconds(60); + client.DefaultRequestHeaders.Add("Accept", "*/*"); + client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br"); + client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn"); //List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>(); HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result; - var res = response.Content.ReadAsStringAsync().Result; + //var res = response.Content.ReadAsStringAsync().Result; + string res = ""; + if (response.IsSuccessStatusCode) + { + using (var responseStream = await response.Content.ReadAsStreamAsync()) + { + using (var decompressedStream = new 
GZipStream(responseStream, CompressionMode.Decompress)) + { + using (var reader = new StreamReader(decompressedStream)) + { + res = await reader.ReadToEndAsync(); + // 处理解压缩后的响应内容 + } + } + } + } + else + { + // 处理请求失败的情况 + } var document = parser.ParseDocument(res); var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault(); var contentList = sssdfsdfsd.QuerySelector("ul"); @@ -1224,7 +1299,7 @@ } //更正公告 - public static void gengzhenggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page) + public static async void gengzhenggonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page) { sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A")); @@ -1238,12 +1313,36 @@ string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page); try { - using (HttpClient client = new HttpClient()) + HttpClientHandler handler = new HttpClientHandler(); + handler.CookieContainer = new CookieContainer(); + using (HttpClient client = new HttpClient(handler)) { client.Timeout = TimeSpan.FromSeconds(60); + client.DefaultRequestHeaders.Add("Accept", "*/*"); + client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br"); + client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn"); //List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>(); HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result; - var res = response.Content.ReadAsStringAsync().Result; + //var res = response.Content.ReadAsStringAsync().Result; + string res = ""; + if (response.IsSuccessStatusCode) + { + using (var responseStream = await response.Content.ReadAsStreamAsync()) + { + using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress)) + { + using (var reader = new StreamReader(decompressedStream)) + { + res = await reader.ReadToEndAsync(); + 
// 处理解压缩后的响应内容 + } + } + } + } + else + { + // 处理请求失败的情况 + } var document = parser.ParseDocument(res); var sssdfsdfsd = document.All.Where(m => m.ClassName == "vT-srch-result-list").FirstOrDefault(); var contentList = sssdfsdfsd.QuerySelector("ul"); @@ -1539,7 +1638,7 @@ //废标公告 - public static void feibiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page) + public static async void feibiaogonggao(WebCrawlerContext _ccontext, string sichuanpageurll, string startTime, string endTime , int page) { sichuanpageurll = sichuanpageurll.Replace("start_time=", "start_time=" + startTime.Replace("-", "%3A")); @@ -1553,12 +1652,36 @@ string sichuanpageurl2 = sichuanpageurll.Replace("page_index=1", "page_index=" + page); try { - using (HttpClient client = new HttpClient()) + HttpClientHandler handler = new HttpClientHandler(); + handler.CookieContainer = new CookieContainer(); + using (HttpClient client = new HttpClient(handler)) { //List<sichuanjieshoudtl> data = new List<sichuanjieshoudtl>(); client.Timeout = TimeSpan.FromSeconds(60); + client.DefaultRequestHeaders.Add("Accept", "*/*"); + client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate, br"); + client.DefaultRequestHeaders.Add("Host", "search.ccgp.gov.cn"); HttpResponseMessage response = client.GetAsync(sichuanpageurl2).Result; - var res = response.Content.ReadAsStringAsync().Result; + //var res = response.Content.ReadAsStringAsync().Result; + string res = ""; + if (response.IsSuccessStatusCode) + { + using (var responseStream = await response.Content.ReadAsStreamAsync()) + { + using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress)) + { + using (var reader = new StreamReader(decompressedStream)) + { + res = await reader.ReadToEndAsync(); + // 处理解压缩后的响应内容 + } + } + } + } + else + { + // 处理请求失败的情况 + } var document = parser.ParseDocument(res); var sssdfsdfsd = document.All.Where(m => m.ClassName == 
"vT-srch-result-list").FirstOrDefault(); var contentList = sssdfsdfsd.QuerySelector("ul"); -- Gitblit v1.9.1