/// <summary>
/// Crawl entry point: downloads the home page, collects the category entry links
/// (anchors matched by the "em &gt; a" selector) and passes their addresses on to the
/// page-info step.
/// </summary>
/// <remarks>
/// Any exception raised here or propagated from the later steps is logged and swallowed.
/// </remarks>
public static void LetsStart()
{
    try
    {
        NLogHelper.Info("寻找网站入口...");

        var homeHtml = GetHtmlByUrl(baseurl);
        var parser = new HtmlParser();
        var document = parser.ParseDocument(homeHtml);

        var moreLinks = document.QuerySelectorAll("em > a");
        List<string> entryUrls = GetItemsHref(moreLinks);

        // The third entry is a page that is deliberately not parsed for now.
        // NOTE(review): this throws (and aborts the crawl via the catch below) when
        // fewer than three links are found — confirm that is the intended behavior.
        entryUrls.RemoveAt(2);

        GetPageInfo(entryUrls, parser);
    }
    catch (Exception ex)
    {
        NLogHelper.Error("寻找入口出错:" + ex.Message);
    }
}

/// <summary>
/// Visits every category entry URL and reads its pager: total number of pages, items
/// per page and total item count. The three values are cached in the redis hash
/// "pageinfo" (field = URL, value = "pageTotal|pageEvery|total") so that URLs already
/// cached are not fetched again. Afterwards the listing step is started.
/// </summary>
/// <param name="urls">Category entry URLs to inspect.</param>
/// <param name="parser">HTML parser reused for every downloaded page.</param>
/// <exception cref="InvalidOperationException">
/// The pager element or one of its numbers could not be found on a page, and the
/// retries were exhausted.
/// </exception>
static void GetPageInfo(List<string> urls, HtmlParser parser)
{
    // Loop-invariant regex patterns for the three pager numbers.
    const string pattern_pagetotal = "(?<=页次:\\d\\/)\\d+";
    const string pattern_pageevery = "(?<=每页)\\d+";
    const string pattern_total = "(?<=总数)\\d+";

    // Retry the whole pass up to 3 times on any exception; URLs cached by an
    // earlier attempt are skipped, so a retry only redoes the remaining ones.
    ISyncPolicy policy2 = Policy.Handle<Exception>()
        .Retry(3);

    policy2.Execute(() =>
    {
        foreach (string url in urls)
        {
            // Already cached: nothing to fetch for this URL.
            if (!string.IsNullOrEmpty(redis.HashGet("pageinfo", url)))
            {
                continue;
            }

            var htmlSourceCode = GetHtmlByUrl(url);
            var document = parser.ParseDocument(htmlSourceCode);
            var pageInfo = document.QuerySelector(".co_content8 > .x");
            if (pageInfo == null)
            {
                // Fail with context instead of a bare NullReferenceException.
                throw new InvalidOperationException(
                    "Pager element '.co_content8 > .x' not found on " + url);
            }

            string pagerHtml = pageInfo.OuterHtml;
            int page_total = ExtractPagerNumber(pagerHtml, pattern_pagetotal, url);
            int page_every = ExtractPagerNumber(pagerHtml, pattern_pageevery, url);
            int total = ExtractPagerNumber(pagerHtml, pattern_total, url);

            redis.HashSet("pageinfo", url, page_total + "|" + page_every + "|" + total);
        }

        GetPageList(parser);
    });
}

/// <summary>
/// Extracts the first integer matched by <paramref name="pattern"/> from the pager markup.
/// </summary>
/// <param name="pagerHtml">Outer HTML of the pager element.</param>
/// <param name="pattern">Regex whose whole match is the number to read.</param>
/// <param name="url">Page address, used only for the error message.</param>
/// <returns>The matched number.</returns>
/// <exception cref="InvalidOperationException">The pattern did not match.</exception>
static int ExtractPagerNumber(string pagerHtml, string pattern, string url)
{
    Match match = Regex.Match(pagerHtml, pattern);
    if (!match.Success)
    {
        // Without this check Convert.ToInt32("") would raise an uninformative FormatException.
        throw new InvalidOperationException(
            "Pager value matching '" + pattern + "' not found on " + url);
    }

    return Convert.ToInt32(match.Value);
}

/// <summary>
/// Walks every cached category in the redis hash "pageinfo" and starts the page-by-page
/// collection of detail links for it via <c>DoTask</c>.
/// </summary>
/// <param name="parser">HTML parser handed through to <c>DoTask</c>.</param>
/// <remarks>
/// An exception for one category is logged and stops the processing of the remaining ones.
/// </remarks>
static void GetPageList(HtmlParser parser)
{
    const string indexSuffix = "index.html";

    var keys = redis.HashKeys("pageinfo");
    try
    {
        foreach (var key in keys)
        {
            string url = key.ToString();

            // Drop only the trailing "index.html"; a plain Replace would also remove
            // the same text if it appeared earlier in the address.
            if (url.EndsWith(indexSuffix, StringComparison.Ordinal))
            {
                url = url.Substring(0, url.Length - indexSuffix.Length);
            }

            // Cached value layout: "pageTotal|pageEvery|total".
            string pageinfo = redis.HashGet("pageinfo", key);
            string[] parts = pageinfo.Split('|');
            int pagetotal = Convert.ToInt32(parts[0]);
            int total = Convert.ToInt32(parts[2]);

            NLogHelper.Info("当前抓取地址:" + url + ",总页数:" + pagetotal + ",总条数" + total);

            int pageevery = Convert.ToInt32(parts[1]);
            DoTask(parser, 0, pageevery, pagetotal, url, total);
        }
    }
    catch (Exception ex)
    {
        NLogHelper.Error("链接出错,error" + ex.Message);
    }
}

/// <summary>
/// Fetches the movie details and resource links and stores them in the database
/// (per the original author's description; the implementation is not included here).
/// </summary>
/// <param name="param">Download parameters for the detail page.</param>
static async Task GetPageInfo(DownloadParam param)
{
    // The original body is long and relies on many regular expressions to match the
    // data; the author chose not to share it.
}
评论