I'm using the following method to extract text from HTML:
/// <summary>
/// Extracts the text of an HTML fragment: the InnerText of every node that has
/// no children is trimmed and written on its own line, and the combined result
/// is HTML-decoded.
/// </summary>
/// <param name="_html">The HTML markup to read.</param>
/// <returns>
/// The decoded text. Every childless node is visited regardless of its parent,
/// so the contents of script and style elements are part of the result.
/// If anything throws while parsing, the exception is swallowed and the decoded
/// empty string is returned.
/// </returns>
public string getAllText(string _html)
{
    // Remains empty when the parsing step below fails.
    string collected = string.Empty;
    try
    {
        var parsed = new HtmlAgilityPack.HtmlDocument();
        parsed.LoadHtml(_html);

        var buffer = new StringBuilder();
        foreach (var candidate in parsed.DocumentNode.DescendantNodesAndSelf())
        {
            // Only leaf nodes contribute; container nodes would repeat their children's text.
            if (candidate.HasChildNodes)
            {
                continue;
            }

            string content = candidate.InnerText;
            if (string.IsNullOrEmpty(content))
            {
                continue;
            }

            buffer.AppendLine(content.Trim());
        }

        collected = buffer.ToString();
    }
    catch (Exception)
    {
    }

    return System.Web.HttpUtility.HtmlDecode(collected);
}
The problem is that I also get the contents of script and style tags.
How can I exclude them?
// Parse the markup, then detach every script and style element from the tree.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);

// Materialise the matches first so the removals do not disturb the enumeration.
var unwanted = doc.DocumentNode.Descendants()
    .Where(n => n.Name == "script" || n.Name == "style")
    .ToList();
foreach (var node in unwanted)
{
    node.Remove();
}
You can do so using HtmlDocument class:
// Parse the markup and remove every style and script element.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so the result must be checked before it is enumerated.
var unwanted = doc.DocumentNode.SelectNodes("//style|//script");
if (unwanted != null)
{
    // Iterate over a snapshot so the removals cannot affect the enumeration.
    foreach (var node in unwanted.ToList())
    {
        node.Remove();
    }
}
Some excellent answers, System.Linq is handy!
For a non Linq based approach:
/// <summary>
/// Removes every script element from the given document, without using LINQ.
/// </summary>
/// <param name="webDocument">The document to clean; it is modified in place.</param>
/// <returns>The same document instance that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection scriptNodes = webDocument.DocumentNode.SelectNodes("//script");

    // The query result is null when the document holds no script element.
    if (scriptNodes != null)
    {
        foreach (HtmlNode scriptNode in scriptNodes)
        {
            scriptNode.Remove();
        }
    }

    return webDocument;
}
/// <summary>
/// Removes the attributes named "style" and "script" from every node of the
/// given markup. The elements themselves are left in place.
/// </summary>
/// <param name="html">The HTML markup to process.</param>
/// <returns>The outer HTML of the document after the attributes were removed.</returns>
public static string StripStyles(this string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    foreach (var element in parsed.DocumentNode.DescendantsAndSelf())
    {
        // Copy the matches into a list so the attribute collection is not
        // modified while it is being enumerated.
        var doomed = element.Attributes
            .Where(attr => attr.Name == "style" || attr.Name == "script")
            .ToList();
        doomed.ForEach(attr => attr.Remove());
    }

    return parsed.DocumentNode.OuterHtml;
}
Related
I'm using the following method to extract text from HTML:
/// <summary>
/// Extracts the text of an HTML fragment: the InnerText of every node that has
/// no children is trimmed and written on its own line, and the combined result
/// is HTML-decoded.
/// </summary>
/// <param name="_html">The HTML markup to read.</param>
/// <returns>
/// The decoded text. Every childless node is visited regardless of its parent,
/// so the contents of script and style elements are part of the result.
/// If anything throws while parsing, the exception is swallowed and the decoded
/// empty string is returned.
/// </returns>
public string getAllText(string _html)
{
    // Remains empty when the parsing step below fails.
    string collected = string.Empty;
    try
    {
        var parsed = new HtmlAgilityPack.HtmlDocument();
        parsed.LoadHtml(_html);

        var buffer = new StringBuilder();
        foreach (var candidate in parsed.DocumentNode.DescendantNodesAndSelf())
        {
            // Only leaf nodes contribute; container nodes would repeat their children's text.
            if (candidate.HasChildNodes)
            {
                continue;
            }

            string content = candidate.InnerText;
            if (string.IsNullOrEmpty(content))
            {
                continue;
            }

            buffer.AppendLine(content.Trim());
        }

        collected = buffer.ToString();
    }
    catch (Exception)
    {
    }

    return System.Web.HttpUtility.HtmlDecode(collected);
}
The problem is that I also get the contents of script and style tags.
How can I exclude them?
// Parse the markup, then detach every script and style element from the tree.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);

// Materialise the matches first so the removals do not disturb the enumeration.
var unwanted = doc.DocumentNode.Descendants()
    .Where(n => n.Name == "script" || n.Name == "style")
    .ToList();
foreach (var node in unwanted)
{
    node.Remove();
}
You can do so using HtmlDocument class:
// Parse the markup and remove every style and script element.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so the result must be checked before it is enumerated.
var unwanted = doc.DocumentNode.SelectNodes("//style|//script");
if (unwanted != null)
{
    // Iterate over a snapshot so the removals cannot affect the enumeration.
    foreach (var node in unwanted.ToList())
    {
        node.Remove();
    }
}
Some excellent answers, System.Linq is handy!
For a non Linq based approach:
/// <summary>
/// Removes every script element from the given document, without using LINQ.
/// </summary>
/// <param name="webDocument">The document to clean; it is modified in place.</param>
/// <returns>The same document instance that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection scriptNodes = webDocument.DocumentNode.SelectNodes("//script");

    // The query result is null when the document holds no script element.
    if (scriptNodes != null)
    {
        foreach (HtmlNode scriptNode in scriptNodes)
        {
            scriptNode.Remove();
        }
    }

    return webDocument;
}
/// <summary>
/// Removes the attributes named "style" and "script" from every node of the
/// given markup. The elements themselves are left in place.
/// </summary>
/// <param name="html">The HTML markup to process.</param>
/// <returns>The outer HTML of the document after the attributes were removed.</returns>
public static string StripStyles(this string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    foreach (var element in parsed.DocumentNode.DescendantsAndSelf())
    {
        // Copy the matches into a list so the attribute collection is not
        // modified while it is being enumerated.
        var doomed = element.Attributes
            .Where(attr => attr.Name == "style" || attr.Name == "script")
            .ToList();
        doomed.ForEach(attr => attr.Remove());
    }

    return parsed.DocumentNode.OuterHtml;
}
I want to parse HTML using HTML Agility Pack.
When I look the attribute up by its integer index, I get a result.
// Download the page and parse it into a document.
var htmlWeb = new HtmlWeb();
var htmlDocument = htmlWeb.Load("http://www.timeanddate.com/worldclock/georgia/tbilisi");

// Keep the nodes whose first attribute (by position) has the value "ct".
var s1 = htmlDocument.DocumentNode
    .Descendants()
    .Where(node => node.HasAttributes && node.Attributes[0].Value == "ct");
But when I look the attribute up with the string indexer, I get an exception.
// Same search, but the attribute is looked up by name instead of by position.
// NOTE(review): this is the variant reported to throw; presumably Attributes["id"]
// is null for nodes that have attributes but no id, so .Value fails. Confirm.
var s2 = htmlDocument.DocumentNode
    .Descendants()
    .Where(node => node.HasAttributes && node.Attributes["id"].Value == "ct");
And when I don't use LINQ and use a predicate delegate instead, everything is OK.
// Wrap forpred in a delegate and filter the materialised descendant list with it.
Predicate<HtmlNode> pred = forpred;
List<HtmlNode> allNodes = htmlDocument.DocumentNode.Descendants().ToList();
List<HtmlNode> ss = allNodes.FindAll(pred);
/// <summary>
/// Tells whether a node carries an attribute named "id" whose value is "ct".
/// </summary>
/// <param name="node">The node to inspect.</param>
/// <returns>true when such an attribute exists; otherwise false.</returns>
public static bool forpred(HtmlNode node)
{
    // Nodes without any attribute cannot match.
    if (!node.HasAttributes)
    {
        return false;
    }

    foreach (HtmlAttribute candidate in node.Attributes)
    {
        bool isTargetId = candidate.Name == "id" && candidate.Value == "ct";
        if (isTargetId)
        {
            return true;
        }
    }

    return false;
}
//s1.ToList()[0].InnerHtml
//s2.ToList()[0].InnerHtml
//ss[0].InnerHtml
This happens because some elements have attributes but no id attribute, so Attributes["id"] is null for them. Your code can look like this:
// Collect the nodes whose id attribute exists and equals "ct".
var s2 = htmlDocument.DocumentNode
    .Descendants()
    .Where(a =>
    {
        // The indexer yields null when the node has no id attribute.
        var idAttribute = a.Attributes["id"];
        return idAttribute != null && idAttribute.Value == "ct";
    })
    .ToList();
I'm trying to do something like this :
// Fetch the search page and pick every listing container.
var document = htmlWeb.Load(searchUrl);
var hotels = document.DocumentNode.Descendants("div")
    .Where(x => x.Attributes.Contains("class") &&
                x.Attributes["class"].Value.Contains("listing-content"));

foreach (var hotel in hotels)
{
    // Query the listing node directly. Re-parsing it is unnecessary, and
    // HtmlDocument.Load(string) treats its argument as a file path, so it
    // cannot be fed the node's text.
    var anchors = hotel.Descendants("div")
        .Where(x => x.Attributes.Contains("class") &&
                    x.Attributes["class"].Value.Contains("srp-business-name"));

    foreach (var anchor in anchors)
    {
        Console.WriteLine(anchor.InnerHtml);
    }
}
I'm getting results like this :
New York Marriott Marquis
<span class="external-link">
<img height="15" src="/images/sprites/search/icon-link-external.png" width="16">
</span>
And
Courtyard by Marriott New York Manhattan/Times Square South
And so on.
Now I want the innerHtml of the anchors tags having class="url redbold mip-link". So I'm doing this :
// Fetch the search page and select the div elements whose class contains "listing-content".
var document = htmlWeb.Load(searchUrl);
var hotels = document.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("listing-content"));
// NOTE(review): count is never read in this snippet.
int count = 1;
foreach (var hotel in hotels)
{
// A separate document is created for each hotel listing.
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
// NOTE(review): HtmlDocument.Load(string) presumably expects a file path rather than
// markup; LoadHtml(hotel.InnerHtml) may have been intended. Confirm.
htmlDoc.Load(hotel.InnerText);
if (htmlDoc.DocumentNode != null)
{
// Within the listing, pick the div elements whose class contains "srp-business-name".
var anchors = htmlDoc.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("srp-business-name"));
foreach (var anchor in anchors)
{
// NOTE(review): this reloads htmlDoc while 'anchors' is still being enumerated over
// htmlDoc's own nodes; this is likely the source of the reported
// "startIndex cannot be larger than length of string" error. Loading the anchor markup
// into a separate HtmlDocument avoids touching the document being iterated.
htmlDoc.LoadHtml(anchor.InnerHtml);
// NOTE(review): SelectNodes may return null when no <a> element is present. Confirm.
var hoteltags = htmlDoc.DocumentNode.SelectNodes("//a");
foreach (var tag in hoteltags)
{
// Only one of the two checks has to succeed, so whitespace-only content still passes.
if (!string.IsNullOrEmpty(tag.InnerHtml) || !string.IsNullOrWhiteSpace(tag.InnerHtml))
{
Console.WriteLine(tag.InnerHtml);
}
}
}
}
}
I'm getting the first result properly, which is New York Marriott Marquis, but on the second result an error occurs:
startIndex cannot be larger than length of string. What am I doing wrong?
You are using the same DOM object for all your operations:
// Excerpt: one HtmlDocument instance (htmlDoc) is created per hotel.
foreach (var hotel in hotels)
{
HtmlDocument htmlDoc = new HtmlDocument();
And after that you are using the same object for loading anchor tags:
// Excerpt: the same htmlDoc instance is loaded again for each anchor.
foreach (var anchor in anchors)
{
htmlDoc.LoadHtml(anchor.InnerHtml);
Just change the document in the second iterator and it should work as expected.
// Excerpt: a fresh document is created for every anchor, so loading the anchor's
// markup no longer replaces the contents of the document being iterated.
foreach (var anchor in anchors)
{
var htmlDocAnchor= new HtmlDocument();
htmlDocAnchor.LoadHtml(anchor.InnerHtml);// ...then continue querying htmlDocAnchor as before.
<html>
<A NAME="doc_id_1"></A>
<div class="find1">
Iam here, extract me.
</div>
<div class = "find2">
iam here also, extract me as well.
</div>
<A NAME="doc_id_2"></A>
</html>
I have used below code to extract data :
// Locate the named anchors that delimit the region of interest.
// XPath addresses attributes with '@'.
var nodes = doc.DocumentNode.SelectNodes("//a[@name = 'doc_id_1']");
var nodes1 = doc.DocumentNode.SelectNodes("//a[@name = 'doc_id_2']");

// SelectNodes returns null rather than an empty collection when nothing matches.
if (nodes != null)
{
    foreach (HtmlNode node in nodes)
    {
        string yourText1 = node.InnerText;

        // NextSibling is whatever node directly follows the anchor; it is null when
        // the anchor is the last child. NOTE(review): it may be a whitespace text
        // node rather than the following div. Confirm against the real markup.
        string yourText2 = node.NextSibling != null
            ? node.NextSibling.InnerHtml
            : string.Empty;
    }
}
I don't want to rely on the class names of those divs, because I am writing generic code. Any help will be appreciated.
One option, using Linq:
var doc = new HtmlDocument();
doc.LoadHtml(html: Resources.Html);

// The two named anchors mark the start and the end of the wanted region.
// XPath addresses attributes with '@'.
var startNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var endNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_2']");

// Both anchors are expected to be children of the same parent.
var parent = startNode.ParentNode;
var nodesYouWant = parent.ChildNodes
    .SkipWhile(node => node != startNode) // skip all nodes up to the start node
    .Skip(1)                              // skip the start node itself
    .TakeWhile(node => node != endNode)   // take all nodes up to the end anchor
    .Where(node => node.Name == "div");   // keep only div nodes
Or:
// Find the delimiting anchors (XPath addresses attributes with '@'), then keep
// only the div nodes among the siblings that lie between them.
var currentNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var endNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_2']");
var nodesYouWant = GetEnclosedNodes(currentNode, endNode).Where(node => node.Name == "div");
/// <summary>
/// Lazily yields the siblings that follow <paramref name="currentNode"/>, stopping
/// before <paramref name="endNode"/> or when there are no more siblings.
/// </summary>
/// <param name="currentNode">The node after which enumeration starts; it is not yielded.</param>
/// <param name="endNode">The node at which enumeration stops; it is not yielded.</param>
/// <returns>The sibling nodes strictly between the two given nodes.</returns>
private static IEnumerable<HtmlNode> GetEnclosedNodes(HtmlNode currentNode, HtmlNode endNode)
{
    for (var sibling = currentNode.NextSibling;
         sibling != null && sibling != endNode;
         sibling = sibling.NextSibling)
    {
        yield return sibling;
    }
}
I'm assuming you will know the name value of the two anchor tags.
// NOTE: the document must be loaded with the markup (for example via LoadHtml)
// before it is queried; a fresh HtmlDocument contains no nodes.
var doc = new HtmlDocument();

// XPath addresses attributes with '@'. The result is null when no such anchor exists.
var firstAnchor = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var div = firstAnchor != null ? firstAnchor.NextSibling : null;

// Walk the following siblings until they run out or the second anchor is reached.
while (div != null)
{
    // The second anchor is recognised by its name attribute; a node's Name is its
    // tag name and would never equal "doc_id_2".
    var nameAttribute = div.Attributes["name"];
    if (nameAttribute != null && nameAttribute.Value == "doc_id_2")
    {
        break;
    }

    // Siblings also include non-div nodes, so filter on the tag name.
    if (div.Name == "div")
    {
        var divText = div.InnerText; //do whatever with this
    }

    div = div.NextSibling;
}
I wonder how I could remove specific HTML tags using HtmlAgilityPack, as below?
// Parse the description markup into a document.
var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(Description);

// Names of the elements that should be removed.
var markups = new List<string>(new[] { "br", "ol", "ul", "li" });
thanks
you can use this method
// Matches a tag, including one that is left unclosed at the end of the input.
private static readonly Regex TagPattern = new Regex(@"<[^>]*(>|$)", RegexOptions.Compiled);

// Matches any run of whitespace, line breaks included.
private static readonly Regex WhitespacePattern = new Regex(@"[\s\r\n]+", RegexOptions.Compiled);

/// <summary>
/// Strips every tag from the given markup using regular expressions and collapses
/// each run of whitespace into a single space.
/// </summary>
/// <param name="content">The markup to clean; may be null or empty.</param>
/// <returns>
/// The text without tags, or an empty string when <paramref name="content"/> is
/// null or empty.
/// </returns>
public static string RemoveHTMLTags(string content)
{
    // Handle missing input explicitly instead of relying on a swallowed exception.
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    string textOnly = TagPattern.Replace(content, string.Empty);
    return WhitespacePattern.Replace(textOnly, " ");
}
// Names of the elements to remove.
var markups = new List<string> { "br", "ol", "ul", "li" };

// Combine them into one XPath union: "//br | //ol | //ul | //li".
var xpath = string.Join(" | ", markups.ConvertAll(tag => "//" + tag).ToArray());
var nodes = htmlDoc.DocumentNode.SelectNodes(xpath);

// The query result is null when none of the elements occurs in the document.
if (nodes != null)
{
    foreach (var match in nodes)
    {
        // NOTE(review): Remove() detaches the whole element, presumably together
        // with its content. Confirm that this is the intended result.
        match.Remove();
    }
}