Scraping HTML DOM elements using HtmlAgilityPack in ASP.NET - c#

I am Scraping HTML DOM elements using HtmlAgilityPack in ASP.NET. currently my code is loading all the href links which means that sublinks of sublinks also . But I need only the depending URL of my domain URL. I don't know how to write code for it. Can any one help me to do this?
Here is my code:
public void GetURL(string strGetURL)
{
var getHtmlSource = new HtmlWeb();
var document = new HtmlDocument();
try
{
document = getHtmlSource.Load(strGetURL);
var aTags = document.DocumentNode.SelectNodes("//a");
if (aTags != null)
{
outputurl.Text = string.Empty;
int _count = 0;
foreach (var aTag in aTags)
{
string strURLTmp;
strURLTmp = aTag.Attributes["href"].Value;
if (_count != 0)
{
if (!CheckDuplicate(strURLTmp))
{
lstResults.Add(strURLTmp);
outputurl.Text += strURLTmp + "\n";
counter++;
GetURL(strURLTmp);
}
}
_count++;
}
}
}

If you meant to get URL that contains specific domain, you can change the XPath to be :
//a[contains(#href, 'your domain here')]
Or if you prefer LINQ than XPath :
var aTags = document.DocumentNode.SelectNodes("//a");
if (aTags != null)
{
....
var relevantLinks = aTags.Where(o => o.GetAttributeValue("href", "")
.Contains("your domain here")
);
....
}
GetAttributeValue() is a better way to get value of an attribute using HAP. Instead of returning null which may cause exception, this method returns the 2nd parameter when the attribute is not found in the context node.

Related

C# Webscraper to grab amount of Google Results given a specific search term

I've been working on a webscraper as a Windows Forms application in C#. The user enter a search term and the term and the program will then split the search string for each individual words and look up the amount of search results through Yahoo and Google.
My issue lies with the orientation of the huge HTML document. I've tried multiple approaches such as
iterating recursively and comparing ids aswell as with lamba and the Where statements. Both results in null. I also manually looked into the html document to make sure the id of the div I want exist in the document.
The id I'm looking for is "resultStats" but it is suuuuuper nested. My code looks like this:
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace WebScraper2._0
{
public class Webscraper
{
private string Google = "http://google.com/#q=";
private string Yahoo = "http://search.yahoo.com/search?p=";
private HtmlWeb web = new HtmlWeb();
private HtmlDocument GoogleDoc = new HtmlDocument();
private HtmlDocument YahooDoc = new HtmlDocument();
public Webscraper()
{
Console.WriteLine("Init");
}
public int WebScrape(string searchterms)
{
//Console.WriteLine(searchterms);
string[] ssize = searchterms.Split(new char[0]);
int YahooMatches = 0;
int GoogleMatches = 0;
foreach (var term in ssize)
{
//Console.WriteLine(term);
var y = web.Load(Yahoo + term);
var g = web.Load(Google + term + "&cad=h");
YahooMatches += YahooFilter(y);
GoogleMatches += GoogleFilter(g);
}
Console.WriteLine("Yahoo found " + YahooMatches.ToString() + " matches");
Console.WriteLine("Google found " + GoogleMatches.ToString() + " matches");
return YahooMatches + GoogleMatches;
}
//Parse to get correct info
public int YahooFilter(HtmlDocument doc)
{
//Look for node with correct ID
IEnumerable<HtmlNode> nodes = doc.DocumentNode.Descendants().Where(n => n.HasClass("mw-jump-link"));
foreach (var item in nodes)
{
// displaying final output
Console.WriteLine(item.InnerText);
}
//TODO: Return search resultamount.
return 0;
}
int testCounter = 0;
string toReturn = "";
bool foundMatch = false;
//Parse to get correct info
public int GoogleFilter(HtmlDocument doc)
{
if (doc == null)
{
Console.WriteLine("Null");
}
foreach (var node in doc.DocumentNode.ChildNodes)
{
toReturn += Looper(node, testCounter, toReturn, foundMatch);
}
Console.WriteLine(toReturn);
/*
var stuff = doc.DocumentNode.Descendants("div")
.Where(node => node.GetAttributeValue("id", "")
.Equals("extabar")).ToList();
IEnumerable<HtmlNode> nodes = doc.DocumentNode.Descendants().Where(n => n.HasClass("appbar"));
*/
return 0;
}
public string Looper(HtmlNode node, int counter, string returnstring, bool foundMatch)
{
Console.WriteLine("Loop started" + counter.ToString());
counter++;
Console.WriteLine(node.Id);
if (node.Id == "resultStats")
{
returnstring += node.InnerText;
}
foreach (HtmlNode n in node.Descendants())
{
Looper(n, counter, returnstring, foundMatch);
}
return returnstring;
}
}
}
I made an google HTML Scraper a few weeks ago, a few things to consider
First: Google don't like when you try to Scrape their Search HTML, while i was running a list of companies trying to get their addresses and phone number, Google block my IP from accessing their website for a little bit (Which cause a hilarious panic in the office)
Second: Google will change the HTML (Id names and etc) of the page so using ID's won't work, on my case i used the combination of HTML Tags and specific information to parse the response and extract the information that i wanted.
Third: It's better to just use their API to grab the information you need, just make sure you respect their free tier query limit and you should be golden.
Here is the Code i used.
public static string getBetween(string strSource, string strStart, string strEnd)
{
int Start, End;
if (strSource.Contains(strStart) && strSource.Contains(strEnd))
{
Start = strSource.IndexOf(strStart, 0) + strStart.Length;
End = strSource.IndexOf(strEnd, Start);
return strSource.Substring(Start, End - Start);
}
else
{
return "";
}
}
public void SearchResult()
{
//Run a Google Search
string uriString = "http://www.google.com/search";
string keywordString = "Search String";
WebClient webClient = new WebClient();
NameValueCollection nameValueCollection = new NameValueCollection();
nameValueCollection.Add("q", keywordString);
webClient.QueryString.Add(nameValueCollection);
string result = webClient.DownloadString(uriString);
string search = getBetween(result, "Address", "Hours");
rtbHtml.Text = getBetween(search, "\">", "<");
}
On my case i used the String Address and Hours to limit what information i wanted to extract.
Edit: Fixed the Logic and added the Code i used.
Edit2: forgot to add the GetBetween Class. (sorry it's my first Answer)

Handling selectSingleNode(Xpath) in XML/Soap envelope having elements with same prefix and multiple uri

I have a Soap packet with same prefix ns2 for multiple uri: http://schemas.datacontract.org/2004/07/TestServices, http://schemas.datacontract.org/2004/07/TestServices.DTO.
When I am trying to set value in XDoc using xDoc.SelectSingleNode(xPath, oManager).InnerXml , it is giving null because of multiple uri conflict. Is there any way to dynamically hanlde this? I cannot do any changes in xpath as it is coming from different services with different format. My namespace manager code is as follows:
public static XmlNamespaceManager getAllNamespaces(XmlDocument xDoc)
{
XmlNamespaceManager result = new XmlNamespaceManager(xDoc.NameTable);
IDictionary<string, string> localNamespaces = null;
XPathNavigator xNav = xDoc.CreateNavigator();
while (xNav.MoveToFollowing(XPathNodeType.Element))
{
localNamespaces = xNav.GetNamespacesInScope(XmlNamespaceScope.Local);
foreach (var localNamespace in localNamespaces)
{
string prefix = localNamespace.Key;
if (string.IsNullOrEmpty(prefix))
prefix = "DEFAULT";
result.AddNamespace(prefix, localNamespace.Value);
}
}
return result;
}
foreach (Telerik.WinControls.UI.RadTreeNode tn in treeView.SelectedNode.Nodes)
{
if (tn.Text == r.Cells[0].Value.ToString())
{
string xPath = tn.FullPath;
xPath = "//" + xPath.Replace(#"\", "/").Replace(" ", "");
if (!(r.Cells[2].Value.ToString().StartsWith("<") && r.Cells[2].Value.ToString().EndsWith(">")) && !(r.Cells[2].Value.ToString().StartsWith("[") && r.Cells[2].Value.ToString().EndsWith("]")))
{
xDoc.SelectSingleNode(xPath, oManager).InnerXml = r.Cells[2].Value.ToString();
}
}
}
My soap envelope is as follows:
<s11:Envelope xmlns:s11='http://schemas.xmlsoap.org/soap/envelope/'>
<s11:Body>
<ns1:AccountCreation xmlns:ns1='www.test.com'>
<ns1:Data>
<ns2:LoginUser xmlns:ns2='http://schemas.datacontract.org/2004/07/TestServices'>?XXX?</ns2:LoginUser>
<ns2:UserPassword xmlns:ns2='http://schemas.datacontract.org/2004/07/TestServices'>?XXX?</ns2:UserPassword>
<ns2:IPAddress xmlns:ns2='http://schemas.datacontract.org/2004/07/TestServices'>?XXX?</ns2:IPAddress>
<ns2:ProductID xmlns:ns2='http://schemas.datacontract.org/2004/07/TestServices.DTO'>?999?</ns2:ProductID>
<ns2:BusinessAccountNumber xmlns:ns2='http://schemas.datacontract.org/2004/07/TestServices.DTO'>?999.99?</ns2:BusinessAccountNumber>
<ns2:StoreName xmlns:ns2='http://schemas.datacontract.org/2004/07/TestServices.DTO'>?XXX?</ns2:StoreName>
<ns2:Title xmlns:ns2='http://schemas.datacontract.org/2004/07/TestServices.DTO'>?XXX?</ns2:Title>
</ns1:Data>
</ns1:AccountCreation>
</s11:Body>
</s11:Envelope>

C# web scraper navigate to aspx link

I have a C# Windows Phone 8.1 app which I am building. Part of the app needs to go and look for information on a specific web page. One of the fields which I need is a URL which can be found on certain items on the page, however I am finding that the URL is in a relative-style format
FullArticle.aspx?a=323495
I am wondering if there is a way in C# using HtmlAgilityPack, HttpWebRequest etc etc to find the link to the actual page. Code snippet is below.
private static TileUpdate processSingleNewsItem(HtmlNode newsItemNode)
{
Debug.WriteLine("");
var articleImage = getArticleImage(getNode(newsItemNode, "div", "nw-container-panel-articleimage"));
var articleDate = getArticleDate(getNode(newsItemNode, "div", "nw-container-panel-articledate"));
var articleSummary = getArticleSummary(getNode(newsItemNode, "div", "nw-container-panel-textarea"));
var articleUrl = getArticleUrl(getNode(newsItemNode, "div", "nw-container-panel-articleimage"));
return new TileUpdate{
Date = articleDate,
Headline = articleSummary,
ImagePath = articleImage,
Url = articleUrl
};
}
private static string getArticleUrl(HtmlNode parentNode)
{
var imageNode = parentNode.Descendants("a").FirstOrDefault();
Debug.WriteLine(imageNode.GetAttributeValue("href", null));
return imageNode.GetAttributeValue("href", null);
}
private static HtmlNode getNode(HtmlNode parentNode, string nodeType, string className)
{
var children = parentNode.Elements(nodeType).Where(o => o.Attributes["class"].Value == className);
return children.First();
}
Would appreciate any ideas or solutions. Cheers!
In my web crawler here's what I do:
foreach (HtmlNode link in doc.DocumentNode.SelectNodes(#"//a[#href]"))
{
HtmlAttribute att = link.Attributes["href"];
if (att == null) continue;
string href = att.Value;
if (href.StartsWith("javascript", StringComparison.InvariantCultureIgnoreCase)) continue; // ignore javascript on buttons using a tags
Uri urlNext = new Uri(href, UriKind.RelativeOrAbsolute);
// Make it absolute if it's relative
if (!urlNext.IsAbsoluteUri)
{
urlNext = new Uri(urlRoot, urlNext);
}
...
}

HTML Agility Pack - Filter Href Value Results

I'm working on a web scraper. The following text shows the results of the code given at the end of this question, which gets the values of all hrefs from a page.
I only want to get values that contain docid=
index.php?pageid=a45475a11ec72b843d74959b60fd7bd64556e8988583f
#
summary_of_documents.php
index.php?pageid=a45475a11ec72b843d74959b60fd7bd64579b861c1d7b
#
index.php?pageid=a45475a11ec72b843d74959b60fd7bd64579e0509c7f0&apform=judiciary
decisions.php?doctype=Decisions / Signed
Resolutions&docid=1263778435388003271#sam
decisions.php?doctype=Decisions / Signed
Resolutions&docid=12637789021669321156#sam
?doctype=Decisions / Signed Resolutions&year=1986&month=January#head
?doctype=Decisions / Signed Resolutions&year=1986&month=February#head
Here's the code:
string url = urlTextBox.Text;
string sourceCode = Extractor.getSourceCode(url);
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(sourceCode);
List<string> links = new List<string>();
if (links != null)
{
foreach (HtmlAgilityPack.HtmlNode nd in doc.DocumentNode.SelectNodes("//a[#href]"))
{
links.Add(nd.Attributes["href"].Value);
}
}
else
{
MessageBox.Show("No Links Found");
}
if (links != null)
{
foreach (string str in links)
{
richTextBox9.Text += str + "\n";
}
}
else
{
MessageBox.Show("No Link Values Found");
}
How can I do this?
Why not just replace this:
links.Add(nd.Attributes["href"].Value);
with this:
if (nd.Attributes["href"].Value.Contains("docid="))
links.Add(nd.Attributes["href"].Value);

Build HtmlGenericControl from a string of full html

I want to be able to add attributes to a string of html without having to build a parser to handle the html. In one specific case, I want to be able to extract the id of the html or insert an id to the html server side.
Say I have:
string stringofhtml = "<img src=\"someimage.png\" alt=\"the image\" />";
I would like to be able to do something like:
HtmlGenericControl htmlcontrol = new HtmlGenericControl(stringofhtml);
htmlcontrol.Attributes["id'] = "newid";
OR
int theid = htmlcontrol.Attributes["id"];
This is just a way that I can access/add attributes of the html strings that I have.
You can do this:
HtmlGenericControl ctrl = new HtmlGenericControl();
ctrl.InnerHtml = "<img src=\"someimage.png\" alt=\"the image\" />";
You could always use a LiteralControl too, instead of an HtmlGenericControl:
LiteralControl lit = new LiteralControl(stringOfHtml);
I do not think there is a control available which will provide you with the functionality you are looking for.
Below I have made use of the HtmlAgility pack to parse/query the HTML and created a new control subclassing the Literal control.
This control accepts an HTML string, checks to ensure it contains at least a single element and provides access to get/set that elements attributes.
Example usage
string image = "<img src=\"someimage.png\" alt=\"the image\" />";
HtmlControlFromString htmlControlFromString = new HtmlControlFromString(image);
htmlControlFromString.Attributes["id"] = "image2";
string id = htmlControlFromString.Attributes["id"];
Control
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI.WebControls;
using HtmlAgilityPack;
public class HtmlControlFromString : Literal
{
private HtmlDocument _document = new HtmlDocument();
private HtmlNode _htmlElement;
public AttributesCollection Attributes { get; set; }
public HtmlControlFromString(string html)
{
_document.LoadHtml(html);
if (_document.DocumentNode.ChildNodes.Count > 0)
{
_htmlElement = _document.DocumentNode.ChildNodes[0];
Attributes = new AttributesCollection(_htmlElement);
Attributes.AttributeChanged += new EventHandler(Attributes_AttributeChanged);
SetHtml();
}
else
{
throw new InvalidOperationException("Argument does not contain a valid html element.");
}
}
void Attributes_AttributeChanged(object sender, EventArgs e)
{
SetHtml();
}
void SetHtml()
{
Text = _htmlElement.OuterHtml;
}
}
public class AttributesCollection
{
public event EventHandler AttributeChanged;
private HtmlNode _htmlElement;
public string this[string attribute]
{
get
{
HtmlAttribute htmlAttribute = _htmlElement.Attributes[attribute];
return htmlAttribute == null ? null : htmlAttribute.Value;
}
set
{
HtmlAttribute htmlAttribute = _htmlElement.Attributes[attribute];
if (htmlAttribute == null)
{
htmlAttribute = _htmlElement.OwnerDocument.CreateAttribute(attribute);
htmlAttribute.Value = value;
_htmlElement.Attributes.Add(htmlAttribute);
}
else
{
htmlAttribute.Value = value;
}
EventHandler attributeChangedHandler = AttributeChanged;
if (attributeChangedHandler != null)
attributeChangedHandler(this, new EventArgs());
}
}
public AttributesCollection(HtmlNode htmlElement)
{
_htmlElement = htmlElement;
}
}
Hope this helps.

Categories

Resources