C# web data extraction code debugging

Question

tendaishava 0 Newbie Poster

13 Years Ago

hello guys my code is not giving me any errors but wont execute the task it just loops can, anyone help me debug it. its a code for extracting book information from a chinese website. thanks in advance.

using System;
using System.Collections.Generic;
using System.Collections.Concurrent;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Threading.Tasks;

namespace BookInfoCrawler1
{
  class ChinaPubCrawler : DeepCrawler
  {
    public ChinaPubCrawler(String keyword)
      : base(keyword)
    {
    }

    public override void update()
    {
      String url = "http://search.china-pub.com/s/?key1=" + _keyword + "&type=&pz=1&t=2";
      String pageContent = getPageContent(url);

      if (getBookCount(pageContent) > 0)
      {
        getPageCount(pageContent);

        DateTime dt1 = DateTime.Now;
        if(true)
        {
          // use concurrent method simply
          ConcurrentStack<String> pageData = new ConcurrentStack<String>();
          Parallel.For(1, _webInfo._pageCount+1, pageIndex =>
          {
            String newUrl = url + "&page=" + pageIndex;
            pageData.Push(getPageContent(newUrl));
          });

          while (pageData.TryPop(out pageContent))
          {
            foreach (string oneBook in getAllBook(pageContent))
            {
              extractBookInfo(oneBook);
            }
          }
        }
        else
        {
          // traditional method

         // {
       //     String newUrl = url + "&page=" + pageIndex;
        //    pageContent = getPageContent(newUrl);

        //    foreach (string oneBook in getAllBook(pageContent))
        //    {
          //    extractBookInfo(oneBook);
          //  }
    //      }
       }

        DateTime dt2 = DateTime.Now;
        _costTime = (dt2 - dt1).TotalMilliseconds;
      }

      if (_webInfo._bookCount != _bookInfos.Count)
      {
        _webInfo._bookCount = _bookInfos.Count;
      }
    }

    protected override int getBookCount(String pageContent)
    {
      try
      {
        String keyword = "break";
        String pattern = generateTagPattern(keyword);
        Regex reg = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
        MatchCollection mc = reg.Matches(pageContent);

        if (mc.Count == 0)
        {
          Console.WriteLine("No matching book!");
        }
        else
        {
          String bookStuff = mc[0].Value;
          //<p>使用 <span>xml</span> 搜索，共有 <span>534  种</span> 商品</p>
          String bookPattern = "<span>([0-9]+?).+</span>";
          Regex bookReg = new Regex(bookPattern, RegexOptions.IgnoreCase);

          _webInfo._bookCount = Convert.ToInt32(bookReg.Match(bookStuff).Groups[1].Value);

          Console.WriteLine("OK, we got " + _webInfo._bookCount + " books now!");
        }
      }
      catch (System.Exception webEx)
      {
        Console.WriteLine(webEx.Message.ToString());
      }

      return _webInfo._bookCount;
    }

    protected override int getPageCount(String pageContent)
    {
      _webInfo._pageCount = Convert.ToInt32(Math.Ceiling(_webInfo._bookCount / 20.0));
      return _webInfo._pageCount;
    }

    protected override List<String> getAllBook(String pageContent)
    {
      List<String> allBooks = new List<String>();
      try
      {
        String majorPattern = generateTagPattern("listview");
        Regex majorReg = new Regex(majorPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
        Match majorMatch = majorReg.Match(pageContent);

        // <pre><a target="_blank" href="http://product.china-pub.com/29851">
        String pattern = "<pre><a target=\"_blank\" href=\"(.+?)\"";
        Regex reg = new Regex(pattern, RegexOptions.IgnoreCase);
        MatchCollection mc = reg.Matches(majorMatch.Value);

        if (mc.Count > 0)
        {
          //foreach (Match match in mc)
          //{
          //  allBooks.Add(match.Value);
          //}

          // use concurrent method simply
          ConcurrentStack<String> pageData = new ConcurrentStack<String>();
          Parallel.For(0, mc.Count, idx =>
          {
            pageData.Push(getPageContent(mc[idx].Value));
          });

          String tmpPageContent;
          while (pageData.TryPop(out tmpPageContent))
          {
            allBooks.Add(tmpPageContent);
          }
        }
        else
        {
          Console.WriteLine("No matching content!");
        }
      }
      catch (System.Exception webEx)
      {
        Console.WriteLine(webEx.Message.ToString());
      }

      return allBooks;
    }

    protected override void extractBookInfo(String bookContent)
    {
      BookInfo bookInfo;
      bookInfo._name = String.Empty;
      bookInfo._author = String.Empty;
      bookInfo._description = String.Empty;
      bookInfo._price = String.Empty;
      bookInfo._discount = String.Empty;
      bookInfo._webSite = "www.china-pub.com";

      Match _match;

      #region BookName
      // <h1 class="black15c" id='js_shuming'>Java Web开发详解--XML+XSLT+Servlet+JSP深入剖析与实例应用 (被《程序员》等机构评选为2006年最受读者喜爱的十大IT图书之一)</h1>
      String keyword = "js_shuming";
      String pattern = "<(?<HtmlTag>[\\w]+)[^>]*\\s[iI][dD]=(?<Quote>[\"']?)" + keyword + "(?(Quote)\\k<Quote>)[\"']?[^>]*>((?<Nested><\\k<HtmlTag>[^>]*>)|</\\k<HtmlTag>>(?<-Nested>)|.*?)*</\\k<HtmlTag>>"; ;
      Regex reg = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
      MatchCollection mc = reg.Matches(bookContent);
      bookInfo._name = mc[0].Value;
      #endregion

      #region BookAuthor
      String mainPattern = generateTagPattern("lcon more-infos");
      Regex mainReg = new Regex(mainPattern, RegexOptions.IgnoreCase);
      Match mainMatch = mainReg.Match(bookContent);

      String authorPattern = "<a href=.+?<strong>(.+?)</strong>";
      Regex authorReg = new Regex(authorPattern, RegexOptions.IgnoreCase);

      mc = authorReg.Matches(bookContent);
      foreach (Match match in mc)
      {
        bookInfo._author += match.Groups[1].Value + " ";
      }
      bookInfo._author = bookInfo._author.Trim();
      #endregion

      #region BookDescription
      String descriPattern = generateTagPattern("neirong");
      Regex descriReg = new Regex(descriPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);

      _match = descriReg.Match(bookContent);
      bookInfo._description = _match.Groups[1].Value;
      bookInfo._description = bookInfo._description.Replace("<br />", String.Empty);
      bookInfo._description = bookInfo._description.Replace("\n", " ");
      bookInfo._description = bookInfo._description.Trim();
      #endregion

      #region BookPrice
      mainPattern = generateTagPattern("price-area");
      mainReg = new Regex(mainPattern, RegexOptions.IgnoreCase|RegexOptions.Singleline);
      _match = mainReg.Match(bookContent);

      String pricePattern = "<li>.+?￥(.+?)</li>";
      Regex priceReg = new Regex(pricePattern, RegexOptions.IgnoreCase);

      _match = priceReg.Match(_match.Value);
      bookInfo._price = _match.Groups[1].Value;
      bookInfo._price = bookInfo._description.Trim();
      #endregion

      #region discount
      String discountPattern = "\"discount\">(.+?)<";
      Regex discountReg = new Regex(discountPattern, RegexOptions.IgnoreCase);

      _match = discountReg.Match(bookContent);
      bookInfo._discount = _match.Groups[1].Value;
      #endregion

      // make sure it is a right book what we need
      if (bookInfo._name.Contains(_keyword))
      {
        _bookInfos.Add(bookInfo);
      }
    }

  }
}

c c# c++ java-jsp listview regex

2 Contributors
1 Reply
337 Views
14 Hours Discussion Span
Latest Post 13 Years Ago Latest Post by skatamatic

Reply to this topic

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.

skatamatic 371 Practically a Posting Shark · Answer 1 · 2012-05-03T18:39:21+00:00

Have you tried stepping through it? I see some fishy looking lines, namely:

if (true) 
...
else

Seems like some useless logic. Try stepping through it and post your findings.