I'm using HtmlAgilityPack. Does it have a function similar to jQuery closest? (closest parent that matches a CSS selector). I tried google and the website http://html-agility-pack.net - and both don't appear to have an answer.
As there is no built-in method currently, you can write a Extension method to achieve this.
I have written a simple extension method which can be used to find elements with tagName, ID and class names that you can use.
Anyways it can be further extended easily to match other selectors.
public static class HtmlAgilityPackExtensions
{
public static HtmlNode Closest(this HtmlNode node, string jQuerySelector)
{
if (node == null) return null;
string tagName = "", id = "";
var classes = new List<string>();
if (jQuerySelector.Contains("."))
{
var parts = jQuerySelector.Split('.');
if (!string.IsNullOrWhiteSpace(parts[0]))
{
tagName = parts[0];
}
for (int i = 1; i < parts.Length; i++)
{
classes.Add(parts[i]);
}
}
if (jQuerySelector.Contains("#"))
{
var parts = jQuerySelector.Split('#');
if (!string.IsNullOrWhiteSpace(parts[0]))
{
tagName = parts[0];
}
id = parts[1];
}
if (string.IsNullOrWhiteSpace(tagName) && string.IsNullOrWhiteSpace(id) && classes.Count == 0)
{
tagName = jQuerySelector;
}
HtmlNode closestParent = null;
while (node.ParentNode != null && closestParent == null)
{
var isClosest = true;
node = node.ParentNode;
if (!string.IsNullOrWhiteSpace(tagName))
{
isClosest = node.Name == tagName;
}
if (isClosest && !string.IsNullOrWhiteSpace(id))
{
isClosest = node.Id == id;
}
if (isClosest && classes.Count > 0)
{
var classNames = node.GetAttributeValue("class", "");
if (!string.IsNullOrWhiteSpace(classNames))
{
foreach (string c in classes)
{
isClosest = classNames.Contains(c);
if (!isClosest) break;
}
}
}
if (isClosest)
{
closestParent = node;
}
}
return closestParent;
}
}
Test Code
var html = "<div><div id='parent1' class='parent'><span id='parent2' class='parent'><div id='parent3' class='parent'><div id='TestNode' class='child'>Test node</div></div></span></div></div>";
var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(html);
var testNode1 = htmlDoc.DocumentNode.SelectSingleNode("//div[#id='TestNode']");
if (testNode1 != null)
{
var parent1 = testNode1.Closest(".parent");
var parent2 = testNode1.Closest("#parent1");
var parent3 = testNode1.Closest("span.parent");
var nonExistingParent = testNode1.Closest("span.parent1");
}
I needed the same thing, but couldn't find any, so I wrote my own Closest function:
public static HtmlNode Closest(this HtmlNode node, string search)
{
search = search.ToLower();
while (node.ParentNode != null)
{
if (node.ParentNode.Name.ToLower() == search) return node.ParentNode;
node = node.ParentNode;
}
return null;
}
This one only works for tag names (as I needed) you can extend it to classes, attributes, and ...
Related
I have a method that runs spellcheck in a word, but what i want here is that if the word already exist in my errorlist it should not be add as error. below is my code.
public void CheckSpelling()
{
int lineno = 0;
bool start = false;
foreach (string line in _contentList)
{
lineno++;
if (line.Contains("<text>"))
{
start = true;
}
if (start)
{
foreach (Match match in Regex.Matches(line, "<.*?>[^<]+</(.*?)>", RegexOptions.IgnoreCase))
{
List<string> customdiclist = new List<string>(File.ReadAllLines(Combine(AppDomain.CurrentDomain.BaseDirectory, ConfigurationManager.AppSettings["customdic"])));
string[] strArray = Regex.Replace(match.Value, "</?[^>]+>", string.Empty).Split(' ');
foreach (string word in strArray)
{
if ((word.Trim() != string.Empty) && ((word.Substring(0, 1) != word.Substring(0, 1).ToUpper()) && !_helper.CheckSpelling(Regex.Match(word, "[a-zA-Z]+").Value) && !customdiclist.Contains(word)))
{
ErrorModel errorModel = new ErrorModel()
{
LineNumber = lineno,
ErrorMessage = "Please Check Misspelled words",
Text = word
};
ErrorList.Add(errorModel);
}
}
}
}
}
}
_helper.CheckSpelling
class Helper
{
private static readonly string DictPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, ConfigurationManager.AppSettings["dictionary"]);
private readonly Hunspell _splCheck = new Hunspell(DictPath + #"\nl_NL.aff", DictPath + #"\nl_NL.dic");
public bool CheckSpelling(string strWord)
{
if(!_splCheck.Spell(strWord.Trim()))
{
return false;
}
return true;
}
}
Can some one help, I dont want any duplicate word in my errorlist.
You can use LINQ to check if your ErrorList already contains an ErrorModel object with a Text property which has the value of the word you are checking.
Using FirstOrDefault you can check to see if it returns a null. If an item cannot be found in the list, a null will be returned as the default value because your list is a collection of class objects (whose default value is null).
If the FirstOrDefault returns null then the word has not been added to the ErrorList.
For more information see When to use .First and when to use .FirstOrDefault with LINQ?.
foreach (string word in strArray)
{
if ((word.Trim() != string.Empty) && ((word.Substring(0, 1) != word.Substring(0, 1).ToUpper()) && !_helper.CheckSpelling(Regex.Match(word, "[a-zA-Z]+").Value) && !customdiclist.Contains(word)))
{
if (ErrorList.FirstOrDefault(e => e.Text == word) == null)
{
ErrorModel errorModel = new ErrorModel()
{
LineNumber = lineno,
ErrorMessage = "Please Check Misspelled words",
Text = word
};
ErrorList.Add(errorModel);
}
}
}
I have this implementation of a tree recursive function. It gets all possible paths from A to D.
The recursive function is:
private static List<Path> GetPath(int depth, Path path)
{
if (depth == nodes.Length)
{
return new List<Path> { path };
}
else
{
var result = new List<Path>();
foreach(var link in nodes[depth].Links)
{
Node node = new Node { Name = nodes[depth].Name, Links = new[] { link } };
path.Add(node);
result.AddRange(
GetPath(
depth+1, path));
}
return result;
}
}
The expected results should be:
A1-B2->C3->D4
A1-B5->C3->D4
However, the paths returned are the same and they include all possible nodes twice.
What's wrong with the function?
foreach(var link in nodes[depth].Links)
{
Node node = new Node { Name = nodes[depth].Name, Links = new[] { link } };
path.Add(node);
You probably intend to make a new path (which is a copy of path) for each node that you find here, before appending the next node.
As #moreON's suggest, I add clone function into class Path, and modify the loop, in loop i copy to new instance of path:
public class Path : List<Node>
{
public override string ToString()
{
String s = "";
foreach (var node in this)
{
s += node.Name + node.Links[0] + "->";
}
return s;
}
public Path Clone()
{
var newPath = new Path();
ForEach(x => newPath.Add(new Node {Name = x.Name, Links = new int[] {x.Links[0]}}));
return newPath;
}
}
private static List<Path> GetPath(int depth, Path path)
{
if (depth == nodes.Length)
{
return new List<Path> { path };
}
else
{
var result = new List<Path>();
foreach (var link in nodes[depth].Links)
{
Node node = new Node { Name = nodes[depth].Name, Links = new[] { link } };
var currentPath = path.Clone();
currentPath.Add(node);
result.AddRange(GetPath(depth + 1, currentPath));
}
return result;
}
}
Hope this help.
i want to parse HTML Using HTML Agility Pack
When i am searching index with int i am getting result.
HtmlWeb htmlWeb = new HtmlWeb();
HtmlDocument htmlDocument = htmlWeb.Load("http://www.timeanddate.com/worldclock/georgia/tbilisi");
var s1 = htmlDocument.DocumentNode.Descendants().Where(x => x.HasAttributes && x.Attributes[0].Value == "ct");
But when i want to search atribute with string indexer i get an exeption.
var s2 = htmlDocument.DocumentNode.Descendants().Where(a => a.HasAttributes && a.Attributes["id"].Value == "ct");
And when i dont use LINQ and use predicate delegate everithing is Ok.
Predicate<HtmlNode> pred = new Predicate<HtmlNode>(forpred);
List<HtmlNode> ss = htmlDocument.DocumentNode.Descendants().ToList().FindAll(pred);
public static bool forpred(HtmlNode node)
{
if (node.HasAttributes)
{
foreach (HtmlAttribute atribute in node.Attributes)
{
if (atribute.Name == "id" && atribute.Value == "ct")
{
return true;
}
}
}
return false;
}
//s1.ToList()[0].InnerHtml
//s2.ToList()[0].InnerHtml
//ss[0].InnerHtml
Because some spans have attributes but not id. Your code can be like this:
var s2 = htmlDocument.DocumentNode
.Descendants()
.Where(a => a.Attributes["id"]!=null && a.Attributes["id"].Value == "ct")
.ToList();
Consider the following XML which I have to parse.
<root>
<item>
<itemId>001</itemId>
<itemName>test 1</itemName>
<description/>
</item>
</root>
I have to parse each of its tag and store it into a table as follows:
TAG_NAME TAG_VALUE IsContainer
------------ -------------- -----------
root null true
item null true
itemId 001 false
itemName test 1 false
description null false
/item null true
/root null true
Now to get this done, I am using XmlReader as this allows us to parse each & every node.
I am doing it as follows:
I created the following class to contain each tag's data
public class XmlTag
{
public string XML_TAG { get; set; }
public string XML_VALUE { get; set; }
public bool IsContainer { get; set; }
}
I am trying to get the list of tags(including closing ones) as follows:
private static List<XmlTag> ParseXml(string path)
{
var tags = new List<XmlTag>();
using (var reader = XmlReader.Create(path))
{
while (reader.Read())
{
var tag = new XmlTag();
bool shouldAdd = false;
switch (reader.NodeType)
{
case XmlNodeType.Element:
shouldAdd = true;
tag.XML_TAG = reader.Name;
//How do I get the VALUE of current reader?
//How do I determine if the current node contains children nodes to set IsContainer property of XmlTag object?
break;
case XmlNodeType.EndElement:
shouldAdd = true;
tag.XML_TAG = string.Format("/{0}", reader.Name);
tag.XML_VALUE = null;
//How do I determine if the current closing node belongs to a node which had children.. like ROOT or ITEM in above example?
break;
}
if(shouldAdd)
tags.Add(tag);
}
}
return tags;
}
but I am having difficulty determining the following:
How to determine if current ELEMENT contains children XML nodes? To set IsContainer property.
How to get the value of current node value if it is of type XmlNodeType.Element
Edit:
I have tried to use LINQ to XML as follows:
var xdoc = XDocument.Load(#"SampleItem.xml");
var tags = (from t in xdoc.Descendants()
select new XmlTag
{
XML_TAG = t.Name.ToString(),
ML_VALUE = t.HasElements ? null : t.Value,
IsContainer = t.HasElements
}).ToList();
This gives me the XML tags and their values but this does not give me ALL the tags including the closing ones. That's why I decided to try XmlReader. But If I have missed anything in LINQ to XML example, please correct me.
First of all, as noted by Jon Skeet in the comments you should probably consider using other tools, like XmlDocument possibly with LINQ to XML (EDIT: an example with XmlDocument follows).
Having said that, here is the simplest solution for what you have currently (note that it's not the cleanest possible code, and it doesn't have much validation):
private static List<XmlTag> ParseElement(XmlReader reader, XmlTag element)
{
var result = new List<XmlTag>() { element };
while (reader.Read())
{
switch (reader.NodeType)
{
case XmlNodeType.Element:
element.IsContainer = true;
var newTag = new XmlTag() { XML_TAG = reader.Name };
if (reader.IsEmptyElement)
{
result.Add(newTag);
}
else
{
result.AddRange(ParseElement(reader, newTag));
}
break;
case XmlNodeType.Text:
element.XML_VALUE = reader.Value;
break;
case XmlNodeType.EndElement:
if (reader.Name == element.XML_TAG)
{
result.Add(new XmlTag()
{
XML_TAG = string.Format("/{0}", reader.Name),
IsContainer = element.IsContainer
});
}
return result;
}
}
return result;
}
private static List<XmlTag> ParseXml(string path)
{
var result = new List<XmlTag>();
using (var reader = XmlReader.Create(path))
{
while (reader.Read())
{
if (reader.NodeType == XmlNodeType.Element)
{
result.AddRange(ParseElement(
reader,
new XmlTag() { XML_TAG = reader.Name }));
}
else if (reader.NodeType == XmlNodeType.EndElement)
{
result.Add(new XmlTag()
{
XML_TAG = string.Format("/{0}",current.Name)
});
}
}
}
return result;
}
An example using XmlDocument. This will give slightly different result for self-enclosing tags (<description/> in your case). You can change this behaviour easily, depending on what you want.
private static IEnumerable<XmlTag> ProcessElement(XElement current)
{
if (current.HasElements)
{
yield return new XmlTag()
{
XML_TAG = current.Name.ToString(),
IsContainer = true
};
foreach (var tag in current
.Elements()
.SelectMany(e => ProcessElement(e)))
{
yield return tag;
}
yield return new XmlTag()
{
XML_TAG = string.Format("/{0}", current.Name.ToString()),
IsContainer = true
};
}
else
{
yield return new XmlTag()
{
XML_TAG = current.Name.ToString(),
XML_VALUE = current.Value
};
yield return new XmlTag()
{
XML_TAG = string.Format("/{0}",current.Name.ToString())
};
}
}
And using it:
var xdoc = XDocument.Load(#"test.xml");
var tags = ProcessElement(xdoc.Root).ToList();
When I write this code, I get only the parent tag value. I want to get their childnodes value also, please tell me about this.
XmlDocument DOC = new XmlDocument();
DOC.RemoveAll();
DOC.Load("C:\\Users\\DIGITEL EYE SYSTEM\\Desktop\\response.xml");
foreach (XmlNode AllNodes in ParentNode)
{
Project.Name = AllNodes["Name"].InnerText;
if (AllNodes.ChildNodes == DOC.GetElementsByTagName("AppBuilderForms"))
{
// Project.Forms = DOC.GetElementsByTagName("");
// String sb = AllNodes["Forms"].InnerText;
}
else if (AllNodes.ChildNodes==DOC.GetElementsByTagName("CheckMarkObject"))
{
checkmark.Name = AllNodes["Name"].InnerText;
checkmark.Label = AllNodes["Label"].InnerText;
// checkmark.IsChecked = AllNodes["IsChecked"].InnerText;
}
else if (ParentNode == DOC.GetElementsByTagName("DateTimeObject"))
{
DateTime.Name = AllNodes["Name"].InnerText;
DateTime.Label = AllNodes["Label"].InnerText;
}
else if (ParentNode == DOC.GetElementsByTagName("LocationObject"))
{
Location.Name = AllNodes["Name"].InnerText;
Location.Label = AllNodes["Label"].InnerText;
Location.Longitude = AllNodes["Longitude"].InnerText;
Location.Latitude = AllNodes["Latitude"].InnerText;
}
else if (ParentNode==DOC.GetElementsByTagName("SwitchObject"))
{
Switch.Name = AllNodes["Name"].InnerText;
Switch.Label = AllNodes["Label"].InnerText;
// Switch.IsChecked =AllNodes["IsChecked"].InnerText;
}
else if(ParentNode==DOC.GetElementsByTagName("TextViewObject"))
{
TextView.Name = AllNodes["Name"].InnerText;
TextView.Value = AllNodes["Value"].InnerText;
}
else if (ParentNode ==DOC.GetElementsByTagName("TextFieldObject"))
{
TextField.Name = AllNodes["Name"].InnerText;
TextField.Value = AllNodes["Value"].InnerText;
}
else if (ParentNode == DOC.GetElementsByTagName("PhotoPickerObject"))
{
PhotoPicker.Name = AllNodes["Name"].InnerText;
PhotoPicker.Label = AllNodes["Label"].InnerText;
}
else if (ParentNode == DOC.GetElementsByTagName("SpinWheelPickerObject"))
{
SpinWheelPicker.Name = AllNodes["Name"].InnerText;
SpinWheelPicker.Label = AllNodes["Label"].InnerText;
// SpinWheelPicker.Columns = AllNodes["Columns"].InnerText;
}
}
var xdoc = XDocument.Load(#"C:\Users\DIGITEL EYE SYSTEM\Desktop\response.xml");
var allElements = xdoc.Root.Elements();
foreach (string element in allElements)
{
//TODO add logic
}
First we'll load up the xml into a XDocument (needs .Net 3.5),
nothing odd going on here.
Second we'll select the root node and ALL
the elements under the root into a IEnumrable. You can add a filter here in the Elements() method.
Third we'll start iterating over the elements in our IEnumerable and implicitly
cast them to a string, this is a operator in the LINQ to XML lib
that just returns the XElement.Value (so if you think that's more
readable or need the whole Element for some other reason write
that! I.E XElement element in allElements)
Don't know how to do it in XmlDocument, I've totally forgotten, hopefully this might help you in case you'll go down that path (pun intended).