/// <summary>
/// Crawl entry point. Downloads the home page, collects the links matched by the
/// <c>em &gt; a</c> selector (the category entrances), drops the third one, and passes
/// the remaining addresses to <c>GetPageInfo</c>. Any exception is logged and swallowed.
/// </summary>
public static void LetsStart()
{
    try
    {
        NLogHelper.Info("寻找网站入口...");

        var homeHtml = GetHtmlByUrl(baseurl);
        var parser = new HtmlParser();
        var categoryAnchors = parser.ParseDocument(homeHtml).QuerySelectorAll("em > a");

        List<string> categoryUrls = GetItemsHref(categoryAnchors);
        // The entry at index 2 is a page that is not parsed for now.
        categoryUrls.RemoveAt(2);

        GetPageInfo(categoryUrls, parser);
    }
    catch (Exception error)
    {
        NLogHelper.Error(string.Concat("寻找入口出错:", error.Message));
    }
}
/// <summary>
/// Visits each category entry page and caches its paging metadata in the Redis hash
/// <c>pageinfo</c> (field = entry URL, value = <c>pageCount|pageSize|totalCount</c>), so
/// that URLs already present in the hash are not fetched again. Once all URLs are
/// processed, <c>GetPageList</c> is invoked to start the per-category crawl.
/// </summary>
/// <remarks>
/// The whole action (the loop plus the <c>GetPageList</c> call) runs under a retry policy
/// that retries up to 3 times on any exception; an exception that survives the retries
/// propagates to the caller.
/// </remarks>
/// <param name="urls">Category entry URLs to inspect.</param>
/// <param name="parser">Parser used to turn the downloaded HTML into a document.</param>
/// <exception cref="InvalidOperationException">The paging element was not found on a page.</exception>
/// <exception cref="FormatException">A paging number could not be located in the paging element.</exception>
static void GetPageInfo(List<string> urls, HtmlParser parser)
{
    // Look-behind patterns that pick the number following each label in the paging text.
    // The current-page part uses \d+ so a multi-digit page index (e.g. "12/25") also matches.
    const string pageTotalPattern = "(?<=页次:\\d+\\/)\\d+";
    const string pageEveryPattern = "(?<=每页)\\d+";
    const string totalPattern = "(?<=总数)\\d+";

    ISyncPolicy retryPolicy = Policy.Handle<Exception>()
        .Retry(3); // retry up to 3 times
    retryPolicy.Execute(() =>
    {
        foreach (string url in urls)
        {
            // Already cached by an earlier run (or an earlier retry attempt): skip the fetch.
            if (!string.IsNullOrEmpty(redis.HashGet("pageinfo", url)))
            {
                continue;
            }

            var htmlSourceCode = GetHtmlByUrl(url);
            var document = parser.ParseDocument(htmlSourceCode);
            var pageInfo = document.QuerySelector(".co_content8 > .x");
            if (pageInfo == null)
            {
                // Fail with a descriptive error instead of a NullReferenceException.
                throw new InvalidOperationException(
                    "Paging element '.co_content8 > .x' not found on " + url);
            }

            string pagingHtml = pageInfo.OuterHtml;
            int pageTotal = ExtractPagingNumber(pagingHtml, pageTotalPattern, url);
            int pageEvery = ExtractPagingNumber(pagingHtml, pageEveryPattern, url);
            int total = ExtractPagingNumber(pagingHtml, totalPattern, url);

            redis.HashSet("pageinfo", url, pageTotal + "|" + pageEvery + "|" + total);
        }

        GetPageList(parser);
    });
}

/// <summary>
/// Returns the first integer in <paramref name="html"/> matched by <paramref name="pattern"/>.
/// </summary>
/// <param name="html">Markup of the paging element.</param>
/// <param name="pattern">Regular expression whose match is the digits to convert.</param>
/// <param name="url">Page address, used only to build the error message.</param>
/// <exception cref="FormatException">The pattern does not match.</exception>
static int ExtractPagingNumber(string html, string pattern, string url)
{
    Match match = Regex.Match(html, pattern);
    if (!match.Success)
    {
        throw new FormatException(
            "Pattern '" + pattern + "' did not match the paging info of " + url);
    }

    return Convert.ToInt32(match.Value);
}
/// <summary>
/// Iterates over every category stored in the Redis hash <c>pageinfo</c>, decodes its
/// cached <c>pageCount|pageSize|totalCount</c> value and hands the category to
/// <c>DoTask</c>, which collects the detail-page links page by page.
/// </summary>
/// <remarks>
/// Failures are logged per category, so one malformed cache entry or failing category
/// does not prevent the remaining categories from being processed.
/// </remarks>
/// <param name="parser">HTML parser forwarded to <c>DoTask</c>.</param>
static void GetPageList(HtmlParser parser)
{
    const string indexSuffix = "index.html";

    var keys = redis.HashKeys("pageinfo");
    foreach (var key in keys)
    {
        try
        {
            string url = key.ToString();
            // Strip only a trailing "index.html" (ordinal match) to get the category base address.
            if (url.EndsWith(indexSuffix, StringComparison.Ordinal))
            {
                url = url.Substring(0, url.Length - indexSuffix.Length);
            }

            string pageinfo = redis.HashGet("pageinfo", key);
            string[] parts = (pageinfo ?? string.Empty).Split('|');
            if (parts.Length < 3)
            {
                throw new FormatException(
                    "Malformed pageinfo entry for " + url + ": '" + pageinfo + "'");
            }

            // Cached layout: pageCount|pageSize|totalCount.
            int pageTotal = Convert.ToInt32(parts[0]);
            int pageEvery = Convert.ToInt32(parts[1]);
            int total = Convert.ToInt32(parts[2]);

            NLogHelper.Info("当前抓取地址:" + url + ",总页数:" + pageTotal + ",总条数" + total);
            DoTask(parser, 0, pageEvery, pageTotal, url, total);
        }
        catch (Exception ex)
        {
            NLogHelper.Error("链接出错,error" + ex.Message);
        }
    }
}
/// <summary>
/// Intended to fetch a movie's details and resource links and store them in the database
/// (per the original author's description; the implementation is omitted from this listing).
/// </summary>
/// <param name="param">Download parameters; not used by the empty body shown here.</param>
/// <remarks>
/// NOTE(review): as shown, the method is <c>async</c> but contains no <c>await</c>;
/// presumably the omitted body awaits its downloads — confirm against the full source.
/// </remarks>
static async Task GetPageInfo(DownloadParam param)
{
// The original body is fairly long and uses many regular expressions to match the data, so the author did not share it.
}
评论