How To Fetch All Links From a URL
<%@ Page Language="C#" AutoEventWireup="true" CodeFile="FetchLink.aspx.cs" Inherits="FetchLink" %>
<%--
    FetchLink.aspx
    Single-form page: the user types a URL into TextBox1 and clicks Button1;
    the code-behind handler Button1_Click binds its results to GridView1.
    The control IDs below are referenced by the code-behind and must not be renamed.
--%>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title>Fetch All Links From a URL</title>
</head>
<body>
    <form id="form1" runat="server">
        <div>
            <table style="width: 50%">
                <%-- Input row: URL text box and the button that posts back to Button1_Click --%>
                <tr>
                    <td style="width: 100px">
                        Enter URL</td>
                    <td style="width: 100px">
                        <asp:TextBox ID="TextBox1" runat="server" Width="243px"></asp:TextBox>
                        <asp:Button ID="Button1" runat="server" Text="Fetch Link" OnClick="Button1_Click" /></td>
                </tr>
                <%-- Result row: columns are auto-generated from the data source bound in code-behind --%>
                <tr>
                    <td style="width: 100px">
                    </td>
                    <td style="width: 100px">
                        <asp:GridView ID="GridView1" runat="server">
                        </asp:GridView>
                    </td>
                </tr>
            </table>
        </div>
    </form>
</body>
</html>
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
/// <summary>
/// Code-behind for FetchLink.aspx. Downloads the page at a user-supplied URL,
/// extracts every href attribute found in its HTML and shows the results in a grid.
/// </summary>
public partial class FetchLink : System.Web.UI.Page
{
    /// <summary>Maximum time, in milliseconds, to wait for the remote server to respond.</summary>
    private const int RequestTimeoutMilliseconds = 100000;

    /// <summary>
    /// Matches an href attribute whose value is double-quoted, single-quoted or unquoted.
    /// The attribute value (without quotes) is captured in the "url" group.
    /// Built once and reused instead of being re-parsed on every call.
    /// </summary>
    private static readonly Regex HrefPattern = new Regex(
        "href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)'|(?<url>[^\\s\"'>]+))",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Matches a leading URI scheme such as "http:", "https:", "mailto:" or "javascript:".</summary>
    private static readonly Regex SchemePrefix = new Regex(
        "^[a-z][a-z0-9+.\\-]*:",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Page load handler. Intentionally empty; all work happens in <see cref="Button1_Click"/>.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Downloads the page at <paramref name="urlToScrape"/> and returns every distinct link found in it.
    /// </summary>
    /// <param name="urlToScrape">Absolute http or https address of the page to read.</param>
    /// <returns>
    /// The distinct links in document order, each formatted as href="value".
    /// Values that already carry a scheme (http:, https:, javascript:, mailto:, ...) are returned
    /// unchanged; all other values are resolved against <paramref name="urlToScrape"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="urlToScrape"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="urlToScrape"/> is not an absolute http/https URI.</exception>
    /// <exception cref="WebException">The page could not be retrieved.</exception>
    public static List<string> GetAllUrlsFromUri(Uri urlToScrape)
    {
        if (urlToScrape == null)
        {
            throw new ArgumentNullException("urlToScrape");
        }

        // Reject file:, ftp: and relative URIs explicitly instead of failing later
        // with an InvalidCastException when the request is cast to HttpWebRequest.
        if (!IsHttpUri(urlToScrape))
        {
            throw new ArgumentException("Only absolute http:// and https:// URLs can be scraped.", "urlToScrape");
        }

        string pageContents = GetPageContents(urlToScrape);
        return ExtractLinks(pageContents, urlToScrape);
    }

    /// <summary>Returns true when <paramref name="uri"/> is absolute and uses the http or https scheme.</summary>
    private static bool IsHttpUri(Uri uri)
    {
        return uri.IsAbsoluteUri
            && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps);
    }

    /// <summary>Finds every href attribute in <paramref name="pageContents"/> and returns the distinct, resolved links.</summary>
    private static List<string> ExtractLinks(string pageContents, Uri baseUri)
    {
        List<string> listOfUrls = new List<string>();
        // Tracks entries already added so duplicates are skipped without rescanning the list.
        Dictionary<string, bool> seen = new Dictionary<string, bool>(StringComparer.Ordinal);

        for (Match match = HrefPattern.Match(pageContents); match.Success; match = match.NextMatch())
        {
            // Work from the captured attribute value, not the raw match text, so that
            // spacing, letter case and quoting style of the attribute cannot corrupt the result.
            string link = ResolveLink(match.Groups["url"].Value, baseUri);
            string entry = "href=\"" + link + "\"";

            if (!seen.ContainsKey(entry))
            {
                seen.Add(entry, true);
                listOfUrls.Add(entry);
            }
        }

        return listOfUrls;
    }

    /// <summary>Returns <paramref name="link"/> unchanged when it has its own scheme; otherwise resolves it against <paramref name="baseUri"/>.</summary>
    private static string ResolveLink(string link, Uri baseUri)
    {
        string trimmed = link.Trim();

        // http:, https:, javascript:, mailto: and similar links are displayed as written.
        if (SchemePrefix.IsMatch(trimmed))
        {
            return trimmed;
        }

        // Proper URI resolution handles "/root", "../up", "page.aspx", "?q=1" and "#fragment" forms,
        // which plain string concatenation with the base address does not.
        Uri resolved;
        if (Uri.TryCreate(baseUri, trimmed, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        // Unresolvable value: show it as found rather than dropping it.
        return trimmed;
    }

    /// <summary>
    /// Reads a web page and captures its HTML representation into a string.
    /// </summary>
    /// <param name="urlToScrape">The address of the page to read.</param>
    /// <returns>The body of the response as text.</returns>
    /// <exception cref="WebException">The request failed or timed out.</exception>
    private static string GetPageContents(Uri urlToScrape)
    {
        HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(urlToScrape);
        httpWebRequest.Timeout = RequestTimeoutMilliseconds;

        // using blocks release the response and stream even when reading fails, and they
        // never touch a null reference when GetResponse itself throws. Exceptions propagate
        // to the caller with their original stack trace.
        using (WebResponse webResponse = httpWebRequest.GetResponse())
        using (Stream responseStream = webResponse.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(responseStream))
        {
            return streamReader.ReadToEnd();
        }
    }

    /// <summary>
    /// Saves a list of urls to a newly created text file, one entry per line.
    /// </summary>
    /// <param name="listOfUrls">The list containing the urls.</param>
    /// <returns>
    /// The name of the file that was created: a new GUID with a "txt" extension.
    /// The name is relative, so the file is created in the process's current directory.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="listOfUrls"/> is null.</exception>
    public static string SaveToFile(List<string> listOfUrls)
    {
        if (listOfUrls == null)
        {
            throw new ArgumentNullException("listOfUrls");
        }

        string fileName = String.Format("{0}.{1}", Guid.NewGuid(), "txt");

        // using guarantees the file handle is released even if a write fails.
        using (StreamWriter sw = File.CreateText(fileName))
        {
            foreach (string url in listOfUrls)
            {
                sw.WriteLine(url);
            }
        }

        return fileName;
    }

    /// <summary>
    /// Click handler for the "Fetch Link" button: validates the typed URL, scrapes it and
    /// binds the links to the grid. Problems are reported through the grid's empty-data text
    /// instead of surfacing as an unhandled exception.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        List<string> listOfUrls = new List<string>();
        string message;
        Uri urlToScrape;
        string input = TextBox1.Text.Trim();

        // NOTE(review): the server fetches whatever address the visitor supplies, which can
        // reach hosts only visible from the server. Restrict allowed hosts if this page is public.
        if (!Uri.TryCreate(input, UriKind.Absolute, out urlToScrape) || !IsHttpUri(urlToScrape))
        {
            message = "Please enter an absolute http:// or https:// URL.";
        }
        else
        {
            try
            {
                listOfUrls = GetAllUrlsFromUri(urlToScrape);
                message = "No links were found on the page.";
            }
            catch (WebException ex)
            {
                message = "The page could not be retrieved: " + HttpUtility.HtmlEncode(ex.Message);
            }
            catch (IOException ex)
            {
                message = "The page could not be read: " + HttpUtility.HtmlEncode(ex.Message);
            }
        }

        // EmptyDataText is only rendered when the bound list has no items.
        GridView1.EmptyDataText = message;
        GridView1.DataSource = listOfUrls;
        GridView1.DataBind();
    }
}