I want to parse HTML using HTML Agility Pack.
When I look up an attribute by its integer index, I get a result.
// Download the page and parse it into an HtmlDocument.
HtmlWeb htmlWeb = new HtmlWeb();
HtmlDocument htmlDocument = htmlWeb.Load("http://www.timeanddate.com/worldclock/georgia/tbilisi");
// Keep every node whose first attribute (looked up by position) has the value "ct".
var s1 = htmlDocument.DocumentNode
    .Descendants()
    .Where(node => node.HasAttributes && node.Attributes[0].Value == "ct");
But when I look up the attribute with the string indexer, I get an exception.
// Same query, but the attribute is looked up by name.
// NOTE: when enumerated this throws for any node that has attributes but no "id",
// because Attributes["id"] is null for such a node and .Value is then dereferenced.
var s2 = htmlDocument.DocumentNode
    .Descendants()
    .Where(node => node.HasAttributes && node.Attributes["id"].Value == "ct");
And when I don't use LINQ and use a predicate delegate instead, everything is OK.
// Wrap forpred in a delegate (method-group conversion) and use it to filter
// a materialized list of all descendant nodes.
Predicate<HtmlNode> pred = forpred;
List<HtmlNode> ss = htmlDocument.DocumentNode
    .Descendants()
    .ToList()
    .FindAll(pred);
// Predicate: true when the node carries an attribute named "id" whose value is "ct".
public static bool forpred(HtmlNode node)
{
    // Nodes without any attributes can never match.
    if (!node.HasAttributes)
    {
        return false;
    }

    foreach (HtmlAttribute candidate in node.Attributes)
    {
        bool isTarget = candidate.Name == "id" && candidate.Value == "ct";
        if (isTarget)
        {
            return true;
        }
    }

    return false;
}
//s1.ToList()[0].InnerHtml
//s2.ToList()[0].InnerHtml
//ss[0].InnerHtml
That is because some spans have attributes but no id attribute. Your code could look like this:
// Null-check the "id" attribute before reading its value, so nodes without an id are skipped.
var s2 = htmlDocument.DocumentNode
    .Descendants()
    .Where(a =>
    {
        var idAttribute = a.Attributes["id"];
        return idAttribute != null && idAttribute.Value == "ct";
    })
    .ToList();
Related
I'm using the following method to extract text from HTML:
/// <summary>
/// Collects the text of every leaf node of an HTML fragment, one trimmed entry per line,
/// and HTML-decodes the result. Text inside script and style elements is not filtered out.
/// </summary>
/// <param name="_html">HTML markup to parse; null or empty yields an empty string.</param>
/// <returns>The decoded text, or an empty string when there is no input or parsing fails.</returns>
public string getAllText(string _html)
{
    // Nothing to parse. This also avoids relying on a swallowed exception for null input.
    if (string.IsNullOrEmpty(_html))
    {
        return "";
    }

    string _allText = "";
    try
    {
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(_html);
        var sb = new StringBuilder();
        foreach (var node in document.DocumentNode.DescendantNodesAndSelf())
        {
            // Only leaf nodes contribute text; container nodes would repeat their children's text.
            if (node.HasChildNodes)
            {
                continue;
            }

            string text = node.InnerText;
            // Skip whitespace-only nodes (e.g. indentation between tags) so the
            // result is not padded with blank lines.
            if (string.IsNullOrWhiteSpace(text))
            {
                continue;
            }

            sb.AppendLine(text.Trim());
        }
        _allText = sb.ToString();
    }
    catch (Exception)
    {
        // Deliberate best effort: if parsing fails, fall through with the empty string
        // instead of propagating the error to the caller.
    }

    return System.Web.HttpUtility.HtmlDecode(_allText);
}
The problem is that I also get the contents of script and style tags.
How can I exclude them?
// Parse the markup, then detach every <script> and <style> element from the tree.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
// Materialize the matches first so the tree is not modified while it is being walked.
var unwanted = doc.DocumentNode.Descendants()
    .Where(n => n.Name == "script" || n.Name == "style")
    .ToList();
foreach (var element in unwanted)
{
    element.Remove();
}
You can do so using the HtmlDocument class:
// Parse the markup, then remove every <style> and <script> element selected via XPath.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);
// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before enumerating.
var styleAndScriptNodes = doc.DocumentNode.SelectNodes("//style|//script");
if (styleAndScriptNodes != null)
{
    foreach (var node in styleAndScriptNodes.ToList())
    {
        node.Remove();
    }
}
Some excellent answers, System.Linq is handy!
For a non Linq based approach:
/// <summary>
/// Removes every script element from the given document and returns the same instance.
/// </summary>
/// <param name="webDocument">Document to clean; it is modified in place.</param>
/// <returns>The document that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection scriptNodes = webDocument.DocumentNode.SelectNodes("//script");

    // A null result means there is nothing to remove.
    if (scriptNodes != null)
    {
        foreach (HtmlNode scriptNode in scriptNodes)
        {
            scriptNode.Remove();
        }
    }

    return webDocument;
}
/// <summary>
/// Parses the markup and removes every attribute named "style" or "script"
/// from all nodes, then returns the resulting HTML.
/// </summary>
/// <param name="html">HTML markup to process.</param>
/// <returns>The outer HTML of the document after the attributes were removed.</returns>
public static string StripStyles(this string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    foreach (var element in parsed.DocumentNode.DescendantsAndSelf())
    {
        // Snapshot the matches so the attribute collection is not changed while it is enumerated.
        var doomed = element.Attributes
            .Where(attr => attr.Name == "style" || attr.Name == "script")
            .ToList();
        doomed.ForEach(attr => attr.Remove());
    }

    return parsed.DocumentNode.OuterHtml;
}
I have successfully accessed the itemId, galleryURL, title and viewItemURL values directly under each item element, but I also need a value that is not a direct child of item: it sits inside another child element. Please have a look at this picture of the XML to get a better idea. Please advise me how I can access listingInfo->watchCount.
/// <summary>
/// Queries the eBay Finding API for a category and returns the found items as JSON.
/// Only the "categoryClick" call type is handled; any other call type returns null.
/// </summary>
/// <param name="OperationName">Finding API operation name placed in the request URL.</param>
/// <param name="calltype">Kind of request; only "categoryClick" is processed.</param>
/// <param name="page">Result page number placed in the request URL.</param>
/// <param name="categoryId">Category id placed in the request URL.</param>
/// <remarks>
/// The remaining parameters (keywords, type, location, condition, min, max, negative,
/// minFeedback, maxFeedback, drange) are not used by this method.
/// </remarks>
/// <returns>A JSON result with one entry per item element, or null for unhandled call types.</returns>
public ActionResult Search(string OperationName, string calltype, string page, string keywords, string type, string location, string condition, string min, string max, string negative, string minFeedback, string maxFeedback, string drange, string categoryId)
{
    string AppId = "demo-key"; //api configs
    string BaseUrl = "http://svcs.ebay.com/services/search/FindingService/v1?OPERATION-NAME="; //base url api end point

    if (calltype != "categoryClick")
    {
        return null;
    }

    // These values come from the caller, so escape them before splicing them into
    // the query string; otherwise a value containing '&' or '=' could alter the request.
    string Url = BaseUrl + Uri.EscapeDataString(OperationName ?? String.Empty)
        + "&SERVICE-VERSION=1.0.0&SECURITY-APPNAME=" + AppId
        + "&RESPONSE-DATA-FORMAT=XML&REST-PAYLOAD&categoryId=" + Uri.EscapeDataString(categoryId ?? String.Empty)
        + "&paginationInput.entriesPerPage=2&paginationInput.pageNumber=" + Uri.EscapeDataString(page ?? String.Empty);

    var items = new List<EbayDataViewModel>();
    XDocument xdoc = XDocument.Load(Url);

    // Only the <item> elements are of interest; match on the local name so
    // the XML namespace of the response does not matter.
    var searchResultItems = xdoc.Descendants()
        .Where(x => x.Name.LocalName == "item");

    foreach (var sri in searchResultItems)
    {
        // Direct children of the current <item>.
        var childElements = sri.Elements();
        var itemId = childElements.FirstOrDefault(x => x.Name.LocalName == "itemId");
        var imageurl = childElements.FirstOrDefault(x => x.Name.LocalName == "galleryURL");
        var title = childElements.FirstOrDefault(x => x.Name.LocalName == "title");
        var url = childElements.FirstOrDefault(x => x.Name.LocalName == "viewItemURL");
        // NumberOfWatch is not populated here: watchCount is not a direct child of <item>
        // but sits one level deeper, under listingInfo.

        // Missing elements are mapped to an empty string.
        items.Add(new EbayDataViewModel
        {
            ItemId = itemId == null ? String.Empty : itemId.Value,
            EbayImageUrl = imageurl == null ? String.Empty : imageurl.Value,
            EbayTitle = title == null ? String.Empty : title.Value,
            EbayUrl = url == null ? String.Empty : url.Value,
        });
    }

    return Json(items);
}
You can use XPathSelectElement (see MSDN XPathSelectElement)
// Sample document with the same item/listingInfo/watchCount nesting.
// The literal must be a verbatim string (@"...") for the doubled quotes and line breaks to be valid.
string str =
@"<?xml version=""1.0""?>
<sri>
<item><listingInfo><watchCount>1</watchCount></listingInfo></item>
<item><listingInfo><watchCount>2</watchCount></listingInfo></item>
</sri>";
XDocument xdoc = XDocument.Parse(str);
var searchResultItems = xdoc.Descendants().Where(x => x.Name.LocalName == "item");
foreach (var item in searchResultItems)
{
    // The XPath is evaluated relative to the current <item>.
    var wc = item.XPathSelectElement("listingInfo/watchCount");
    // XPathSelectElement returns null when the path does not match; skip such items.
    if (wc != null)
    {
        Console.WriteLine(wc.Value);
    }
}
I think this line of code solves your problem:
// Look up listingInfo first: FirstOrDefault returns null when the element is absent,
// and calling Elements() on that null would throw a NullReferenceException.
var listingInfo = childElements.FirstOrDefault(x => x.Name.LocalName == "listingInfo");
var nofwatch = listingInfo == null
    ? null
    : listingInfo.Elements().FirstOrDefault(x => x.Name.LocalName == "watchCount");
foreach (var sri in searchResultItems)
{
    // Direct children of the current <item>.
    var childElements = sri.Elements();
    var itemId = childElements.FirstOrDefault(x => x.Name.LocalName == "itemId");
    var imageurl = childElements.FirstOrDefault(x => x.Name.LocalName == "galleryURL");
    var title = childElements.FirstOrDefault(x => x.Name.LocalName == "title");
    var url = childElements.FirstOrDefault(x => x.Name.LocalName == "viewItemURL");

    // watchCount sits one level deeper, under listingInfo. Resolve listingInfo first so
    // an item without it yields null instead of a NullReferenceException.
    var listingInfo = childElements.FirstOrDefault(x => x.Name.LocalName == "listingInfo");
    var nofwatch = listingInfo == null
        ? null
        : listingInfo.Elements().FirstOrDefault(x => x.Name.LocalName == "watchCount");

    // Missing elements are mapped to an empty string.
    items.Add(new EbayDataViewModel
    {
        ItemId = itemId == null ? String.Empty : itemId.Value,
        EbayImageUrl = imageurl == null ? String.Empty : imageurl.Value,
        EbayTitle = title == null ? String.Empty : title.Value,
        EbayUrl = url == null ? String.Empty : url.Value,
        NumberOfWatch = nofwatch == null ? String.Empty : nofwatch.Value,
    });
}
I'm using HtmlAgilityPack. Does it have a function similar to jQuery's closest (the closest parent that matches a CSS selector)? I searched Google and the website http://html-agility-pack.net, and neither appears to have an answer.
As there is currently no built-in method, you can write an extension method to achieve this.
I have written a simple extension method that finds ancestors by tag name, ID and class names.
It can easily be extended further to match other selectors.
/// <summary>
/// jQuery-style traversal helpers for HtmlAgilityPack nodes.
/// </summary>
public static class HtmlAgilityPackExtensions
{
    // Characters that start an id ('#') or class ('.') part of a selector.
    private static readonly char[] SelectorMarkers = { '.', '#' };

    // Whitespace characters that separate the tokens of a class attribute.
    private static readonly char[] ClassSeparators = { ' ', '\t', '\r', '\n', '\f' };

    /// <summary>
    /// Returns the nearest ancestor of <paramref name="node"/> that matches a simple selector.
    /// The search starts at the parent; the node itself is never returned.
    /// </summary>
    /// <param name="node">Node whose ancestors are searched; null yields null.</param>
    /// <param name="jQuerySelector">
    /// Selector made of an optional tag name followed by any number of "#id" and ".class"
    /// parts, e.g. "span", ".parent", "#parent1", "span.parent", "div#main.a.b".
    /// </param>
    /// <returns>
    /// The closest matching ancestor, or null when none matches or the selector
    /// contains no usable criterion.
    /// </returns>
    public static HtmlNode Closest(this HtmlNode node, string jQuerySelector)
    {
        if (node == null || string.IsNullOrWhiteSpace(jQuerySelector))
        {
            return null;
        }

        string tagName;
        string id;
        var classes = new List<string>();
        ParseSelector(jQuerySelector.Trim(), out tagName, out id, classes);

        // A selector such as "." or "#" carries no criterion; treat it as "no match"
        // rather than letting every ancestor qualify.
        if (tagName.Length == 0 && id.Length == 0 && classes.Count == 0)
        {
            return null;
        }

        for (HtmlNode ancestor = node.ParentNode; ancestor != null; ancestor = ancestor.ParentNode)
        {
            if (Matches(ancestor, tagName, id, classes))
            {
                return ancestor;
            }
        }

        return null;
    }

    // Splits a selector into its tag name, id and class names; handles '#' and '.' in any order.
    private static void ParseSelector(string selector, out string tagName, out string id, List<string> classes)
    {
        id = "";
        int markerIndex = selector.IndexOfAny(SelectorMarkers);
        tagName = markerIndex < 0 ? selector : selector.Substring(0, markerIndex).Trim();

        while (markerIndex >= 0)
        {
            int nextIndex = selector.IndexOfAny(SelectorMarkers, markerIndex + 1);
            int tokenStart = markerIndex + 1;
            int tokenEnd = nextIndex < 0 ? selector.Length : nextIndex;
            string token = selector.Substring(tokenStart, tokenEnd - tokenStart).Trim();

            if (token.Length > 0)
            {
                if (selector[markerIndex] == '#')
                {
                    id = token;
                }
                else
                {
                    classes.Add(token);
                }
            }

            markerIndex = nextIndex;
        }
    }

    // True when the candidate satisfies every criterion that was present in the selector.
    private static bool Matches(HtmlNode candidate, string tagName, string id, List<string> classes)
    {
        // HTML tag names are case-insensitive.
        if (tagName.Length > 0 && !string.Equals(candidate.Name, tagName, StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        if (id.Length > 0 && candidate.Id != id)
        {
            return false;
        }

        if (classes.Count == 0)
        {
            return true;
        }

        // Compare whole class tokens: a node without a class attribute must not match,
        // and ".parent" must not match class="parent1".
        string[] present = candidate.GetAttributeValue("class", "")
            .Split(ClassSeparators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string required in classes)
        {
            if (Array.IndexOf(present, required) < 0)
            {
                return false;
            }
        }

        return true;
    }
}
Test Code
// Nested markup: div > div#parent1 > span#parent2 > div#parent3 > div#TestNode.
var html = "<div><div id='parent1' class='parent'><span id='parent2' class='parent'><div id='parent3' class='parent'><div id='TestNode' class='child'>Test node</div></div></span></div></div>";
var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(html);
// XPath attribute tests use '@' ("@id"); "#id" is not valid XPath.
var testNode1 = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='TestNode']");
if (testNode1 != null)
{
    // Nearest ancestor with class "parent": the div with id "parent3".
    var parent1 = testNode1.Closest(".parent");
    // Ancestor with id "parent1".
    var parent2 = testNode1.Closest("#parent1");
    // Nearest <span> ancestor with class "parent": the span with id "parent2".
    var parent3 = testNode1.Closest("span.parent");
    // No <span> ancestor has class "parent1", so this is null.
    var nonExistingParent = testNode1.Closest("span.parent1");
}
I needed the same thing but couldn't find one, so I wrote my own Closest function:
/// <summary>
/// Returns the nearest ancestor of <paramref name="node"/> whose tag name equals
/// <paramref name="search"/>, ignoring case. The node itself is never returned.
/// </summary>
/// <param name="node">Node whose ancestors are searched; null yields null.</param>
/// <param name="search">Tag name to look for, e.g. "div".</param>
/// <returns>The closest matching ancestor, or null when there is none.</returns>
public static HtmlNode Closest(this HtmlNode node, string search)
{
    if (node == null)
    {
        return null;
    }

    for (HtmlNode ancestor = node.ParentNode; ancestor != null; ancestor = ancestor.ParentNode)
    {
        // Ordinal, case-insensitive comparison: ToLower() depends on the current culture
        // (e.g. "DIV" does not lower-case to "div" under Turkish rules).
        if (string.Equals(ancestor.Name, search, StringComparison.OrdinalIgnoreCase))
        {
            return ancestor;
        }
    }

    return null;
}
This one only works for tag names (which is all I needed); you can extend it to classes, attributes and so on.
I want to write a function that takes an input value (funName), finds the XML element whose FunName attribute matches it, and returns that element's isEnable attribute as a boolean (true or false).
How can I modify this code to do that?
My XML file
<itema>
<itemb FunName="ABC" isEnable="true"></itemb>
<itemb FunName="DEF" isEnable="false"></itemb>
</itema>
My Code
// Loads the XML file, takes the first <itemb> element and prints the value of its
// "isEnable" attribute. As written, funName and isEnable are not consulted and the
// method always returns true.
public bool FunEnable(string funName , string isEnable)
{
    XmlDocument xDL = new XmlDocument();
    xDL.Load("C://XMLFile2.xml"); //Load XML file

    // Only the first matching element is inspected.
    XmlNode firstItem = xDL.SelectSingleNode("//itemb");
    foreach (XmlAttribute attribute in firstItem.Attributes)
    {
        if (attribute.Name != "isEnable")
        {
            continue;
        }

        Console.WriteLine(attribute.Value); // print the attribute we are interested in
    }

    return true;
}
Thanks a lot
Well you can try this :
/// <summary>
/// Looks up the itemb element whose FunName attribute equals <paramref name="funNam"/>
/// and returns its isEnable attribute as a boolean. The outcome is also written to the console.
/// </summary>
/// <param name="funNam">Value to match against the FunName attribute.</param>
/// <returns>
/// The parsed isEnable value of the first matching element; true when no element matches
/// or the attribute is missing or not a valid boolean.
/// </returns>
public static bool FunEnable(string funNam)
{
    bool result = true;
    XmlDocument xDL = new XmlDocument();
    xDL.Load(@"C:/XMLFile2.xml"); //Load XML file
    XmlNodeList nodeList = xDL.SelectNodes("//itemb");
    foreach (XmlNode node in nodeList)
    {
        // Elements without a FunName attribute cannot match; skip them instead of
        // dereferencing a null attribute.
        XmlAttribute nameAttribute = node.Attributes["FunName"];
        if (nameAttribute == null || !nameAttribute.Value.Equals(funNam))
        {
            continue;
        }

        XmlAttribute enabledAttribute = node.Attributes["isEnable"];
        bool parsed;
        // Keep the default when the attribute is absent or not a valid boolean.
        if (enabledAttribute != null && bool.TryParse(enabledAttribute.Value, out parsed))
        {
            result = parsed;
        }
        break;
    }
    Console.WriteLine("with funName = "+ funNam +" isEnable equal to : " + result);
    return result;
}
Output
with funName = ABC isEnable equal to : True
This is fairly trivial using LINQ to XML. You can load the document using XDocument.Load and then get your isEnable value like so:
// Select the isEnable value of the single itemb element whose FunName is "ABC".
// Single() throws unless exactly one element matches.
var result = (from itemb in doc.Descendants("itemb")
              where (string)itemb.Attribute("FunName") == "ABC"
              select (bool)itemb.Attribute("isEnable"))
             .Single();
You can see a working demo here: https://dotnetfiddle.net/MYTOl6
var xDoc = XDocument.Load(path);
// Casting an XAttribute to string yields null when the attribute is missing, so elements
// without FunName or isEnable are handled without a NullReferenceException.
// FirstOrDefault gives false when no element matches.
bool result = (from itemb in xDoc.Descendants("itemb")
               where (string)itemb.Attribute("FunName") == funcName
               select (string)itemb.Attribute("isEnable") == "true")
              .FirstOrDefault();
Well, I prefer LINQ to XML.
Maybe this one works:
/// <summary>
/// Returns whether the itemb element whose FunName attribute equals
/// <paramref name="funName"/> has isEnable set to "true".
/// </summary>
/// <param name="funName">Value to match against the FunName attribute.</param>
/// <param name="isEnable">Not used; kept so existing callers keep compiling.</param>
/// <returns>
/// True when no element matches; otherwise whether the first matching element's
/// isEnable attribute is exactly "true" (a missing attribute counts as false).
/// </returns>
public bool FunEnable(string funName, string isEnable)
{
    XDocument xDL = XDocument.Load("C://XMLFile2.xml");

    // FirstOrDefault evaluates the query once (Count() followed by ElementAt(0) ran it twice).
    // The string cast is null-safe for elements that lack the attribute.
    XElement match = xDL.Descendants("itemb")
        .FirstOrDefault(node => (string)node.Attribute("FunName") == funName);

    if (match == null)
    {
        // No element with the given name: keep the default.
        return true;
    }

    return (string)match.Attribute("isEnable") == "true";
}
I'm using the following method to extract text from HTML:
/// <summary>
/// Collects the text of every leaf node of an HTML fragment, one trimmed entry per line,
/// and HTML-decodes the result. Text inside script and style elements is not filtered out.
/// </summary>
/// <param name="_html">HTML markup to parse; null or empty yields an empty string.</param>
/// <returns>The decoded text, or an empty string when there is no input or parsing fails.</returns>
public string getAllText(string _html)
{
    // Nothing to parse. This also avoids relying on a swallowed exception for null input.
    if (string.IsNullOrEmpty(_html))
    {
        return "";
    }

    string _allText = "";
    try
    {
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(_html);
        var sb = new StringBuilder();
        foreach (var node in document.DocumentNode.DescendantNodesAndSelf())
        {
            // Only leaf nodes contribute text; container nodes would repeat their children's text.
            if (node.HasChildNodes)
            {
                continue;
            }

            string text = node.InnerText;
            // Skip whitespace-only nodes (e.g. indentation between tags) so the
            // result is not padded with blank lines.
            if (string.IsNullOrWhiteSpace(text))
            {
                continue;
            }

            sb.AppendLine(text.Trim());
        }
        _allText = sb.ToString();
    }
    catch (Exception)
    {
        // Deliberate best effort: if parsing fails, fall through with the empty string
        // instead of propagating the error to the caller.
    }

    return System.Web.HttpUtility.HtmlDecode(_allText);
}
The problem is that I also get the contents of script and style tags.
How can I exclude them?
// Parse the markup, then detach every <script> and <style> element from the tree.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
// Materialize the matches first so the tree is not modified while it is being walked.
var unwanted = doc.DocumentNode.Descendants()
    .Where(n => n.Name == "script" || n.Name == "style")
    .ToList();
foreach (var element in unwanted)
{
    element.Remove();
}
You can do so using the HtmlDocument class:
// Parse the markup, then remove every <style> and <script> element selected via XPath.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);
// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before enumerating.
var styleAndScriptNodes = doc.DocumentNode.SelectNodes("//style|//script");
if (styleAndScriptNodes != null)
{
    foreach (var node in styleAndScriptNodes.ToList())
    {
        node.Remove();
    }
}
Some excellent answers, System.Linq is handy!
For a non Linq based approach:
/// <summary>
/// Removes every script element from the given document and returns the same instance.
/// </summary>
/// <param name="webDocument">Document to clean; it is modified in place.</param>
/// <returns>The document that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection scriptNodes = webDocument.DocumentNode.SelectNodes("//script");

    // A null result means there is nothing to remove.
    if (scriptNodes != null)
    {
        foreach (HtmlNode scriptNode in scriptNodes)
        {
            scriptNode.Remove();
        }
    }

    return webDocument;
}
/// <summary>
/// Parses the markup and removes every attribute named "style" or "script"
/// from all nodes, then returns the resulting HTML.
/// </summary>
/// <param name="html">HTML markup to process.</param>
/// <returns>The outer HTML of the document after the attributes were removed.</returns>
public static string StripStyles(this string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    foreach (var element in parsed.DocumentNode.DescendantsAndSelf())
    {
        // Snapshot the matches so the attribute collection is not changed while it is enumerated.
        var doomed = element.Attributes
            .Where(attr => attr.Name == "style" || attr.Name == "script")
            .ToList();
        doomed.ForEach(attr => attr.Remove());
    }

    return parsed.DocumentNode.OuterHtml;
}