Interesting name for a title, wouldn't you say? Well, I couldn't find any better way to put it. Anywho, I have been working on a new program that is designed to read in a website every so many seconds, parse the data, and update the GUI.

Now I have finally found what I believe to be a good way to read in webpages, and I have started to work with threads, more specifically backgroundWorkers. The program itself currently contains two backgroundWorkers, one for a time elapsed function (that updates a label), and one for the reading and parsing part.

Okay so I'm now going to post the code, explain it, and then bring up the problem after that.

/// <summary>
/// Start button handler. When a scoreboard URL is configured it shows and records the
/// start time, swaps the enabled state of the start/stop buttons, and starts both
/// background workers (elapsed-time display and page polling) if they are not already
/// running. Without a URL it only shows an error dialog.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click (object sender, EventArgs e) //Start Button
{
    // IsNullOrWhiteSpace alone covers null, empty and whitespace-only values. The earlier
    // extra Trim() comparison ran first and threw a NullReferenceException on a null setting.
    if (string.IsNullOrWhiteSpace(Properties.Settings.Default.webURL)) //no URL so don't even try to start the program
    {
        MessageBox.Show("There is currently no URL address for where the scoreboard is located", "No URL", MessageBoxButtons.OK, MessageBoxIcon.Error);
        return;
    }

    // The label shows local time for the user; the stored value is UTC.
    toolStripStatusLabel1.Text = "Start Time: " + DateTime.Now.ToString();
    startTime = DateTime.UtcNow;

    button1.Enabled = false;
    button2.Enabled = true;

    if (!backgroundWorker1.IsBusy) //thread for the elapsed time
    {
        backgroundWorker1.RunWorkerAsync(); // Start the asynchronous operation.
    }
    if (!backgroundWorker2.IsBusy) //thread for checking page update
    {
        backgroundWorker2.RunWorkerAsync(); // Start the asynchronous operation.
    }
}

This code is pretty simple, click the button (start button) check to make sure there's even a URL to work with and start the backgroundWorkers

/// <summary>
/// Worker loop for backgroundWorker2: raises ProgressChanged once per refresh interval
/// (refreshTimer seconds) until cancellation is requested, then marks the run as cancelled.
/// </summary>
/// <param name="sender">The BackgroundWorker that is running this handler.</param>
/// <param name="e">Work arguments; <c>Cancel</c> is set to true when the loop stops on a cancel request.</param>
private void backgroundWorker2_DoWork (object sender, DoWorkEventArgs e) //backgroundWorker2 is used to read in the webpage data
{
    // Direct cast: a wrong sender should fail loudly here, not as a NullReferenceException later.
    BackgroundWorker worker2 = (BackgroundWorker) sender;

    // The wait is split into short slices so a cancel request is honoured within roughly
    // this many milliseconds instead of only after a full refresh interval has elapsed.
    const int pollSliceMs = 100;

    while (true)
    {
        if (worker2.CancellationPending)
        {
            e.Cancel = true;
            break;
        }

        worker2.ReportProgress(0);

        int remainingMs = refreshTimer * 1000;
        while (remainingMs > 0 && !worker2.CancellationPending)
        {
            int sliceMs = Math.Min(remainingMs, pollSliceMs);
            System.Threading.Thread.Sleep(sliceMs);
            remainingMs -= sliceMs;
        }
    }

    // The worker is intentionally NOT disposed here: the start button calls RunWorkerAsync()
    // on this same instance again, so it must outlive a single run.
}

Pretty simple: keep the loop going, checking the website every 5 secs (to simplify it), and do this till the user clicks the cancel button or closes the program.

/// <summary>
/// Runs on every progress report: downloads the page, hashes the result and, when the hash
/// differs from the last accepted one, either parses the new data or logs the error text.
/// </summary>
/// <param name="sender">The BackgroundWorker that reported progress (unused).</param>
/// <param name="e">Progress data (unused).</param>
private void backgroundWorker2_ProgressChanged (object sender, ProgressChangedEventArgs e) //used to read in the webpage data and handle it
{
    readInData.WebBrowser();

    string pageData = readInData.isDownloadedData();
    tempHash = generateHashCode.createHash(pageData); //hash of whatever was just read in

    bool pageChanged = tempHash != readInData_Hash;
    if (!pageChanged) //same hash as last time, so just drop the downloaded text
    {
        readInData.clearString();
        return;
    }

    tempTeamData.Clear();

    switch (pageData)
    {
        case "Invalid URL": //the URL provided was invalid
            richTextBox1.AppendText("Invalid URL" + "\n");
            break;

        case "Timed Out": //reading the site timed out
            richTextBox1.AppendText("Reading Timed Out" + "\n");
            break;

        default: //real page content: parse it and remember its hash
            tempTeamData = stripStringData.breakUp(pageData, numOfFlags);
            readInData.clearString();
            readInData_Hash = tempHash;
            break;
    }
}

This is what's called every five seconds. The program reads in the webpage, hashes the string, if the hash is different (meaning the webpage data changed) the code is parsed ... of course that's assuming one of the two strings you see in the else if statement aren't passed back instead.

//===================================================================================================================
    public class readInWebpage
    {
        string downloadedData;
        string URL;
        Uri myUri;
        System.Timers.Timer aTimer;
        bool timerTrigger;
//-------------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Creates a reader for the given address with an empty result, a cleared timeout
        /// flag and a 5000 ms timer wired to OnTimedEvent (the timer is not started here).
        /// </summary>
        /// <param name="URL">Address of the page to read; stored as given.</param>
        public readInWebpage (string URL)
        {
            this.URL = URL;
            this.downloadedData = "";
            this.timerTrigger = false;

            this.aTimer = new System.Timers.Timer(5000);
            this.aTimer.Elapsed += OnTimedEvent;
        }
//-------------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Returns the current contents of the downloadedData field.
        /// </summary>
        /// <returns>The stored download result string.</returns>
        public string isDownloadedData ()
        {
            string currentData = this.downloadedData;
            return currentData;
        }
//-------------------------------------------------------------------------------------------------------------------
        public void WebBrowser ()
        {
            myUri = null;

            timerTrigger = false; //a timer system to allow the process to only work 5 secs (preventing an infinite loop)
            aTimer.Interval = 5000;
            aTimer.Enabled = true;
            aTimer.Start();

            if (Uri.TryCreate(URL, UriKind.Absolute, out myUri)) //checks to make sure the URL is even valid
            {
                WebBrowser wb = new WebBrowser();
                wb.Navigate(URL);

                wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);

                while ((wb.ReadyState != WebBrowserReadyState.Complete) && (timerTrigger == false))
                {
                    Application.DoEvents();
                }

                if (timerTrigger == false)
                {
                    aTimer.Stop();
                    downloadedData = wb.Document.Body.InnerHtml; //Added this line, because the final HTML takes a while to show up 
                }
                else
                {
                    downloadedData = "Timed Out";
                }

                wb.Dispose();
            }
            else //invalid URL
            {
                downloadedData = "Invalid URL";
            }

//===================================================================================================================

So here's what it all comes down to. This program works rather nicely, especially for what I need. The problem is the Application.DoEvents(). This is a dangerous line of code; without it, the program locks up, doing its job but the GUI doesn't get updated and the user can't interact with the GUI (for some reason the backgroundWorkers don't count for threading here ... even though it's called, but whatever).

Now the code here does check to make sure the URL is valid before going into all this work, but there's a problem here. Sometimes a website can appear valid, but trying to access it will be met with a super long load time that will eventually lead to "page cannot be displayed" or never stop loading.

This is where the Application.DoEvents becomes dangerous. This little line (from what I have come to make out) pretty much goes "okay program keep running, don't get held up on this webpage loading" (it's like multiprogramming, dang you OS class). Well if a webpage never loads completely, I could have thread upon thread opening up.

So I came up with what I felt would be a fix: place a timer on that while statement. More specifically, a boolean variable as you see in the code. When the timer goes off it triggers an event, changes the boolean, and we're all good to go, problem avoided ... except it didn't work that way. Something to do with the Application.DoEvents() is blocking it. I commented out this line, placed a breakpoint on the event for completion of the timer and boom, it hit the point (of course the whole GUI locks up so that's not going to work), and when I left the line in, the timer never went off.

Well that's a lot to read, and I hope you guys are still here with me (details mean better help in my opinion). My question is, is there a way I can implement a timer that will actually work while still leaving in the Application.DoEvents()? Or some other option for my build that can allow me the luxury of these features? Again, I am kind of new to threading and have had issues in the past with reading in websites.

Thanks in advance for any and all help.

Recommended Answers

All 6 Replies

Lines 36 and 38 (the wb.Navigate(URL) call and the wb.DocumentCompleted subscription) should be reversed: attach handlers before calling the methods that will raise the event.

You have a method with the same name as a class, this is usually frowned on.

WebBrowser.Navigate is an asynchronous method, you shouldn't need to call DoEvents() at all. You should be handling the successful (or failed) loading in the event, not after the call to Navigate.

You never unsubscribe from the event. This can prevent the successful Dispose() of the WebBrowser.

If the timer triggers, you never stop it.

I'd rethink this whole class and its methods. The timer tick event should be canceling the Navigate() call and loading in the "failed" message. The DocumentCompleted handler should be canceling the timer and determining if the page loaded. You should set the timer to only trigger once (Timer.AutoReset Property). You don't need the Boolean value at all.

Thank you Momerath for the post, I have begun to look at the class and am starting to see about some of the problems you mentioned, this class is looking to become a nightmare.

Actually the event does trigger, but then the following lines 45 - 49 (the "if (timerTrigger == false)" block) also execute as well (in other words, that while loop locks the class up and then allows it to move on ... pretty much preventing the class from closing until the DocumentCompleted event is triggered).

So by that means the following line "downloadedData = wb.Document.Body.InnerHtml;", is actually used twice.

I know this sounds bad but do you think you could give me an example of a re-write of this class? The program must wait until the whole webpage is done loading (or a timer is thrown), and while this does it I am starting to see how much this code I found sucks!

(I am going to try and tweak it tonight when I get back from some stuff I have to do, but you're right, this class does need work)

Here is something I threw together that should give you the basic idea. The Object is used to prevent both the page loading and the timer tick happening at the same time (can't have both trying to set the value of the Document). The Boolean is used to tell if the other method has already run but hasn't had time to stop the current method from running.

using System;
using System.Timers;
using System.Windows.Forms;

namespace WindowsFormsApplication4 {
    /// <summary>
    /// Loads a page through a WebBrowser control with a five second timeout.
    /// After <see cref="Process"/> is called, <see cref="Document"/> ends up holding either
    /// the page text or the literal "Timeout loading URI", whichever event happens first.
    /// </summary>
    class Webpage : IDisposable {
        // True while neither the page-loaded nor the timeout handler has claimed the result.
        Boolean doProcess;
        // Lock object that makes the two competing handlers mutually exclusive.
        readonly Object sentinal;
        System.Timers.Timer clock;
        Uri uri;
        WebBrowser wb;

        /// <summary>
        /// Result of the last <see cref="Process"/> call; null until a result is available.
        /// </summary>
        public String Document { get; private set; }

        /// <summary>
        /// Prepares the browser and the one-shot timeout timer for the given address.
        /// </summary>
        /// <param name="uriString">Absolute address of the page to load.</param>
        /// <exception cref="ArgumentException">The string is not a valid absolute URI.</exception>
        public Webpage(String uriString) {
            // UriKind.Absolute: RelativeOrAbsolute accepted almost any string, so the check
            // validated nothing and a relative URI only failed later inside Navigate().
            if (Uri.TryCreate(uriString, UriKind.Absolute, out uri) == false) {
                throw new ArgumentException("Invalid uri");
            }

            doProcess = true;
            sentinal = new Object();

            clock = new System.Timers.Timer();
            clock.AutoReset = false; // fire once per Process() call
            clock.Interval = 5000;
            clock.Elapsed += new ElapsedEventHandler(clock_Elapsed);

            wb = new WebBrowser();
            wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);

        }

        /// <summary>
        /// Starts loading the page and arms the timeout. Returns immediately; the result
        /// shows up in <see cref="Document"/> once one of the two handlers has run.
        /// </summary>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        public void Process() {
            lock (sentinal) {
                if (wb == null || clock == null) {
                    throw new ObjectDisposedException(GetType().Name);
                }

                // Re-arm the state so the same instance can be used for repeated polling;
                // previously a second call could never produce a new result.
                clock.Stop();
                doProcess = true;
                Document = null;
            }

            wb.Navigate(uri);
            clock.Start();
        }

        // Page finished loading: take the result unless the timeout already claimed it.
        void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e) {
            lock (sentinal) {
                if (doProcess && wb != null) {
                    doProcess = false;
                    Document = wb.DocumentText;
                    if (clock != null) {
                        clock.Stop();
                    }
                }
            }
        }

        // Timeout fired: report the failure unless the page already loaded.
        // System.Timers.Timer raises Elapsed on a thread-pool thread when no
        // SynchronizingObject is set, so the browser must not be touched directly from here.
        void clock_Elapsed(object sender, ElapsedEventArgs e) {
            lock (sentinal) {
                if (doProcess && wb != null) {
                    doProcess = false;
                    Document = "Timeout loading URI";

                    if (wb.InvokeRequired) {
                        // Non-blocking hand-off to the control's own thread; BeginInvoke
                        // (not Invoke) so this lock is never held while waiting on the UI.
                        wb.BeginInvoke(new MethodInvoker(wb.Stop));
                    } else {
                        wb.Stop();
                    }
                }
            }
        }

        /// <summary>
        /// Detaches the event handlers and releases the browser and the timer.
        /// Safe to call more than once.
        /// </summary>
        public void Dispose() {
            lock (sentinal) {
                doProcess = false; // any handler still in flight becomes a no-op

                if (wb != null) {
                    wb.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
                    wb.Dispose();
                    wb = null;
                }
                if (clock != null) {
                    clock.Stop();
                    clock.Elapsed -= new ElapsedEventHandler(clock_Elapsed);
                    clock.Dispose();
                    clock = null;
                }
            }
        }
    }
}

Thanks for this, I have already begun to take a look into this (more making sense of the whole thing).

My god, this makes me feel like an amateur programmer. I mean, granted, the code I was using was partially built by someone else and modified by me, but still (I assume it comes with experience?)

On a side note, I find it really funny you used the lock statement. I say this as I have never seen it before, and after Googling I figured out it's pretty much the concept of protecting the Critical Region (Mutual Exclusion), which I just learned about in my OS class.

But once again thanks with this (I'll hopefully take a crack and implementing this ... well after tweaking it, and let you know what happens)

UPDATE:

Okay so I implemented Momerath's code (changed a few variable names, but that's pretty much it). Anyway it seems to work well, not entirely sure. I call the Process(), and then the code moves on and the following line yells at me cause the string is empty.

Pretty much I have to make the main part of the program (backgroundWorker2_ProgressChanged) wait before executing line 5 (whether it be one of the errors or the page has indeed loaded).

I know I could probably lock it off with some while loop, but I would rather avoid this (due to some of the reasons Busy-Waiting for protecting the critical region isn't used).

I have a feeling Delegates might need to be used ... of course this is from what I understand about delegates (in other words, they call a method in one class, once done, from another class). So it would call a method in Form1 (my main) from Momerath's code (from either one of the events).

I assume I am understanding that right? But then if I am, how will that affect the threads? (As you can tell, my knowledge of Delegates is very limited.)

Well for now I think I found a possible fix. I ended up trying something with my original webpage reader class (seeing as I had issues with the one posted above, more me just probably misunderstanding something)

Now if I used Application.DoEvents() the timer refused to trigger, however I discovered if I didn't use Application.DoEvents(), well the DocumentCompleted would never be called and the webpage would always time out.

Well I went off to Google and found other people who had used similar builds. I stumbled onto this on a few ... don't use a timer, use DateTime.

So I removed all the timer-related coding and modified my readInWebpage class into something like this (note that the class was rebuilt, so there will be some differences).

namespace Webpage_AutomatedLogin_1v1
{
//============================================================================================================
    /// <summary>
    /// Reads a web page through a WebBrowser control, pumping the message loop while it
    /// waits, and gives up after a caller-supplied number of seconds. The result (page body
    /// HTML or the literal "Timed Out") is exposed through <see cref="downloadedData"/>.
    /// </summary>
    class readInWebpage_NoThreadSupport : IDisposable
    {
        WebBrowser wb;
        bool timerTriggered; //true when the most recent readInData call hit its timeout
//------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Creates the browser control and subscribes to its DocumentCompleted event.
        /// </summary>
        public readInWebpage_NoThreadSupport ()
        {
            wb = new WebBrowser();
            wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);

            timerTriggered = false;
        }
//------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Result of the last read: the page body HTML, an empty string when the loaded
        /// document has no body, or "Timed Out".
        /// </summary>
        public string downloadedData
        {
            get;
            private set;
        }
//------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Navigates to <paramref name="webLink"/> and waits until the page is complete or
        /// more than <paramref name="secs"/> seconds have passed.
        /// </summary>
        /// <param name="webLink">Address of the page to load.</param>
        /// <param name="secs">Timeout in seconds.</param>
        public void readInData (Uri webLink, int secs)
        {
            //Reset per call: the flag used to stay set after one timeout, so every later
            //call reported "Timed Out" even when the page had loaded fine.
            timerTriggered = false;

            //Stopwatch instead of DateTime.Now: it is unaffected by clock changes and DST.
            System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
            wb.Navigate(webLink);

            while (wb.ReadyState != WebBrowserReadyState.Complete)
            {
                //TotalSeconds, not Seconds: Seconds is only the 0-59 component of the span,
                //so a timeout of 59 seconds or more could never trigger.
                if (stopwatch.Elapsed.TotalSeconds > secs)
                {
                    timerTriggered = true;
                    break;
                }
                //Keeps the message loop running so the browser can make progress.
                //NOTE(review): DoEvents also lets other UI events re-enter the form while
                //this loop is spinning -- confirm callers tolerate that.
                Application.DoEvents();
            }

            if (timerTriggered == false)
            {
                downloadedData = readBodyHtml(wb);
            }
            else
            {
                wb.Stop();
                downloadedData = "Timed Out";
            }
        }
//------------------------------------------------------------------------------------------------------------
        void wb_DocumentCompleted (object sender, WebBrowserDocumentCompletedEventArgs e) //when the webpage has finished loading (read it)
        {
            if (timerTriggered) //a completion arriving after the timeout must not replace "Timed Out"
            {
                return;
            }

            downloadedData = readBodyHtml((WebBrowser) sender);
        }
//------------------------------------------------------------------------------------------------------------
        //Returns the body HTML of the browser's current document, or an empty string when
        //there is no document or no body (previously a NullReferenceException).
        static string readBodyHtml (WebBrowser browser)
        {
            HtmlDocument document = browser.Document;
            if (document == null || document.Body == null)
            {
                return string.Empty;
            }
            return document.Body.InnerHtml;
        }
//------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Detaches the event handler and releases the browser control. Safe to call twice.
        /// </summary>
        public void Dispose () //used for disposing items
        {
            if (wb != null)
            {
                wb.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
                wb.Dispose();
                wb = null;
            }
        }
//------------------------------------------------------------------------------------------------------------
    }
//============================================================================================================
}

Now of course further testing will be needed, but so far it is producing results; let's hope this is the solution.

(By the way if anyone sees a real hazard in this please let me know, I want to assume this is a solution, but I have seen this before where I think I have a solution and I don't)

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.