htmlagilitypack - remove script and style?

htmlagilitypack - remove script and style? - c#

I'm using the following method to extract text from HTML:
/// <summary>
/// Collects the text of every childless node in the given markup, one trimmed
/// entry per line, and returns the HTML-decoded result.
/// </summary>
/// <param name="_html">The HTML markup to read.</param>
/// <returns>The decoded text; the empty string's decoding if anything fails while parsing.</returns>
public string getAllText(string _html)
{
    string collected = "";
    try
    {
        var parsed = new HtmlAgilityPack.HtmlDocument();
        parsed.LoadHtml(_html);

        var buffer = new StringBuilder();
        foreach (var current in parsed.DocumentNode.DescendantNodesAndSelf())
        {
            // Only nodes without children are considered.
            if (current.HasChildNodes)
            {
                continue;
            }

            string leafText = current.InnerText;
            if (string.IsNullOrEmpty(leafText))
            {
                continue;
            }

            buffer.AppendLine(leafText.Trim());
        }

        collected = buffer.ToString();
    }
    catch (Exception)
    {
        // Best effort: a failure leaves whatever was assigned so far (initially "").
    }

    return System.Web.HttpUtility.HtmlDecode(collected);
}
The problem is that I also get the script and style tags.
How can I exclude them?

// Parse the markup, then detach every <script> and <style> element.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);

// Take a snapshot of the matches first, then remove them one by one.
var unwanted = doc.DocumentNode.Descendants()
    .Where(candidate => candidate.Name == "script" || candidate.Name == "style")
    .ToList();
foreach (var element in unwanted)
{
    element.Remove();
}

You can do so using HtmlDocument class:
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);

// SelectNodes returns null (not an empty collection) when the XPath matches nothing,
// so guard before enumerating; otherwise a page without <style>/<script> throws
// a NullReferenceException.
var unwanted = doc.DocumentNode.SelectNodes("//style|//script");
if (unwanted != null)
{
    // Iterate over a copy so removals cannot disturb the collection being walked.
    foreach (var n in unwanted.ToList())
    {
        n.Remove();
    }
}

Some excellent answers; System.Linq is handy!
For a non-LINQ approach:
/// <summary>
/// Removes every script element from the given document without using LINQ.
/// </summary>
/// <param name="webDocument">The document to clean; it is modified in place.</param>
/// <returns>The same document instance that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection scriptNodes = webDocument.DocumentNode.SelectNodes("//script");

    // A null result means there is nothing to remove.
    if (scriptNodes != null)
    {
        foreach (HtmlNode scriptNode in scriptNodes)
        {
            scriptNode.Remove();
        }
    }

    return webDocument;
}

/// <summary>
/// Removes every attribute named "style" or "script" from all nodes of the given markup.
/// </summary>
/// <param name="html">The HTML markup to process.</param>
/// <returns>The outer HTML of the document after the attributes were removed.</returns>
public static string StripStyles(this string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    foreach (var element in parsed.DocumentNode.DescendantsAndSelf())
    {
        // Materialise the matches before removing anything from the attribute collection.
        var doomed = element.Attributes
            .Where(attr => attr.Name == "style" || attr.Name == "script")
            .ToArray();

        for (int i = 0; i < doomed.Length; i++)
        {
            doomed[i].Remove();
        }
    }

    return parsed.DocumentNode.OuterHtml;
}

Related

How I remove unwanted pre and code tags of html content with HtmlAgilityPack [duplicate]

I'm using the following method to extract text from HTML:
/// <summary>
/// Gathers the text of all childless nodes of the given markup, writes each
/// trimmed entry on its own line, and returns the HTML-decoded result.
/// </summary>
/// <param name="_html">The HTML markup to read.</param>
/// <returns>The decoded text gathered from the document.</returns>
public string getAllText(string _html)
{
    string gathered = "";
    try
    {
        var htmlDoc = new HtmlAgilityPack.HtmlDocument();
        htmlDoc.LoadHtml(_html);

        var lines = new StringBuilder();
        foreach (var leaf in htmlDoc.DocumentNode.DescendantNodesAndSelf())
        {
            // Nodes that have children are skipped; null is treated like empty text.
            string content = leaf.HasChildNodes ? null : leaf.InnerText;
            if (!string.IsNullOrEmpty(content))
            {
                lines.AppendLine(content.Trim());
            }
        }

        gathered = lines.ToString();
    }
    catch (Exception)
    {
        // Failures are ignored; the value assigned so far is used.
    }

    gathered = System.Web.HttpUtility.HtmlDecode(gathered);
    return gathered;
}
The problem is that I also get the script and style tags.
How can I exclude them?

// Load the markup and drop all <script> and <style> elements from the tree.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);

// The matches are copied into a list before any of them is removed.
var toDrop = doc.DocumentNode.Descendants()
    .Where(node => node.Name == "script" || node.Name == "style")
    .ToList();
foreach (var node in toDrop)
{
    node.Remove();
}

You can do so using HtmlDocument class:
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);

// SelectNodes yields null rather than an empty collection when no node matches,
// so check for null first to avoid a NullReferenceException on documents that
// contain neither <style> nor <script>.
var matches = doc.DocumentNode.SelectNodes("//style|//script");
if (matches != null)
{
    // Work on a copy so removing nodes cannot affect the collection being iterated.
    foreach (var n in matches.ToList())
    {
        n.Remove();
    }
}

Some excellent answers; System.Linq is handy!
For a non-LINQ approach:
/// <summary>
/// Strips all script elements from a document, using a plain loop instead of LINQ.
/// </summary>
/// <param name="webDocument">The document to clean; it is modified in place.</param>
/// <returns>The document that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection found = webDocument.DocumentNode.SelectNodes("//script");

    // Nothing selected: hand the document back untouched.
    bool nothingToRemove = found == null;
    if (!nothingToRemove)
    {
        foreach (HtmlNode script in found)
        {
            script.Remove();
        }
    }

    return webDocument;
}

/// <summary>
/// Deletes the "style" and "script" attributes from every node of the given markup.
/// </summary>
/// <param name="html">The HTML markup to process.</param>
/// <returns>The document's outer HTML once those attributes are gone.</returns>
public static string StripStyles(this string html)
{
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(html);

    foreach (var current in htmlDoc.DocumentNode.DescendantsAndSelf())
    {
        // ToList() takes a snapshot, so removal happens on a stable list.
        current.Attributes
            .Where(a => a.Name == "style" || a.Name == "script")
            .ToList()
            .ForEach(a => a.Remove());
    }

    return htmlDoc.DocumentNode.OuterHtml;
}

HtmlAgilityPack cant get string indexer

I want to parse HTML using the HTML Agility Pack.
When I look up an attribute by its int index, I get a result.
// Download the page, then keep the nodes whose first attribute has the value "ct".
var htmlWeb = new HtmlWeb();
var htmlDocument = htmlWeb.Load("http://www.timeanddate.com/worldclock/georgia/tbilisi");
var s1 = htmlDocument.DocumentNode
    .Descendants()
    .Where(x => x.HasAttributes)               // index 0 is only read when attributes exist
    .Where(x => x.Attributes[0].Value == "ct");
But when I look up the attribute with the string indexer, I get an exception.
// Same query, but the attribute is looked up by name through the string indexer.
var s2 = htmlDocument.DocumentNode
    .Descendants()
    .Where(a => a.HasAttributes)
    .Where(a => a.Attributes["id"].Value == "ct");
And when I don't use LINQ and use a predicate delegate instead, everything is OK.
// Filter all descendants through the forpred predicate defined below.
Predicate<HtmlNode> pred = forpred;
List<HtmlNode> ss = htmlDocument.DocumentNode.Descendants().ToList().FindAll(pred);

/// <summary>
/// Tells whether a node carries an attribute named "id" whose value is "ct".
/// </summary>
/// <param name="node">The node to inspect.</param>
/// <returns>true when such an attribute exists; otherwise false.</returns>
public static bool forpred(HtmlNode node)
{
    // Nodes without attributes can never match.
    if (!node.HasAttributes)
    {
        return false;
    }

    foreach (HtmlAttribute candidate in node.Attributes)
    {
        bool isTarget = candidate.Name == "id" && candidate.Value == "ct";
        if (isTarget)
        {
            return true;
        }
    }

    return false;
}
//s1.ToList()[0].InnerHtml
//s2.ToList()[0].InnerHtml
//ss[0].InnerHtml

Because some spans have attributes but no id, so Attributes["id"] is null for them. Your code could look like this:
// Keep the nodes whose "id" attribute exists and equals "ct".
var s2 = htmlDocument.DocumentNode
    .Descendants()
    .Where(a =>
    {
        // The string indexer is checked for null before its Value is read.
        var idAttribute = a.Attributes["id"];
        return idAttribute != null && idAttribute.Value == "ct";
    })
    .ToList();

C# HtmlAgilityPack : startIndex cannot be larger than length of string

I'm trying to do something like this :
// Fetch the search page and pick the divs whose class contains "listing-content".
var document = htmlWeb.Load(searchUrl);
var hotels = document.DocumentNode.Descendants("div")
    .Where(x => x.Attributes.Contains("class") &&
                x.Attributes["class"].Value.Contains("listing-content"));
foreach (var hotel in hotels)
{
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.OptionFixNestedTags = true;
    // LoadHtml parses a markup string, whereas Load(string) expects a file path.
    // InnerHtml (not InnerText) is used so the nested <div> elements queried
    // below are still present in the parsed fragment.
    htmlDoc.LoadHtml(hotel.InnerHtml);
    if (htmlDoc.DocumentNode != null)
    {
        var anchors = htmlDoc.DocumentNode.Descendants("div")
            .Where(x => x.Attributes.Contains("class") &&
                        x.Attributes["class"].Value.Contains("srp-business-name")); // Error Occurring in here //
        foreach (var anchor in anchors)
        {
            Console.WriteLine(anchor.InnerHtml);
        }
    }
}
I'm getting results like this :
New York Marriott Marquis
<span class="external-link">
<img height="15" src="/images/sprites/search/icon-link-external.png" width="16">
</span>
And
Courtyard by Marriott New York Manhattan/Times Square South
And so on.
Now I want the InnerHtml of the anchor tags that have class="url redbold mip-link". So I'm doing this:
// Second attempt: for every "listing-content" div, find the "srp-business-name"
// divs and print the inner HTML of each <a> element found inside them.
var document = htmlWeb.Load(searchUrl);
var hotels = document.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("listing-content"));
int count = 1;
foreach (var hotel in hotels)
{
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
// NOTE(review): Load(string) presumably expects a file path, and InnerText carries no
// markup; LoadHtml(hotel.InnerHtml) looks like what was intended - confirm.
htmlDoc.Load(hotel.InnerText);
if (htmlDoc.DocumentNode != null)
{
// `anchors` is a query over htmlDoc's current DocumentNode.
var anchors = htmlDoc.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("srp-business-name"));
foreach (var anchor in anchors)
{
// Reloads the very same htmlDoc that `anchors` is still being enumerated from.
htmlDoc.LoadHtml(anchor.InnerHtml);
// NOTE(review): SelectNodes presumably returns null when no <a> exists; the loop
// below does not guard against that - confirm.
var hoteltags = htmlDoc.DocumentNode.SelectNodes("//a");
foreach (var tag in hoteltags)
{
// The two checks are OR-ed, so this is true for any non-empty InnerHtml,
// including whitespace-only content.
if (!string.IsNullOrEmpty(tag.InnerHtml) || !string.IsNullOrWhiteSpace(tag.InnerHtml))
{
Console.WriteLine(tag.InnerHtml);
}
}
}
}
}
I'm getting the first result properly, which is New York Marriott Marquis, but on the second result an error occurs:
startIndex cannot be larger than length of string. What am I doing wrong?

You are using the same DOM object for all your operations:
foreach (var hotel in hotels)
{
HtmlDocument htmlDoc = new HtmlDocument();
And after that you are using the same object for loading anchor tags:
foreach (var anchor in anchors)
{
htmlDoc.LoadHtml(anchor.InnerHtml);
Just use a separate document inside the second loop and it should work as expected.
foreach (var anchor in anchors)
{
var htmlDocAnchor= new HtmlDocument();
htmlDocAnchor.LoadHtml(anchor.InnerHtml);// And etc..

How to extract different divs between two anchor tags in htmlagility?

<html>
<A NAME="doc_id_1"></A>
<div class="find1">
Iam here, extract me.
</div>
<div class = "find2">
iam here also, extract me as well.
</div>
<A NAME="doc_id_2"></A>
</html>
I have used the code below to extract the data:
// Select the two anchors by their name attribute. In XPath an attribute test is
// written with '@' ("#name" is not valid XPath).
var nodes = doc.DocumentNode.SelectNodes("//a[@name = 'doc_id_1']");
var nodes1 = doc.DocumentNode.SelectNodes("//a[@name = 'doc_id_2']");
foreach (HtmlNode node in nodes)
{
    string yourText1 = node.InnerText;
    //var yourText2 = node.NextSibling.SelectNodes("//div");
    // Reads only the node immediately following the first anchor.
    string yourText2 = node.NextSibling.InnerHtml;
    //foreach (HtmlNode var in yourText2)
    //{
    // string yourText3 = var.InnerHtml;
    //}
}
I don't want to specify the class names of those divs, because I am writing generic code. Any help will be appreciated.

One option, using Linq:
var doc = new HtmlDocument();
doc.LoadHtml(html: Resources.Html);
// XPath attribute tests use '@' ("#name" is not valid XPath).
var startNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var endNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_2']");
// Both anchors are searched for among the children of the start anchor's parent.
var parent = startNode.ParentNode;
var nodesYouWant = parent.ChildNodes
    .SkipWhile(node => node != startNode) // skip all nodes up to the start node
    .Skip(1)                              // skip the start node
    .TakeWhile(node => node != endNode)   // take all nodes up to the next anchor
    .Where(node => node.Name == "div");   // select only div nodes
Or:
// XPath attribute tests use '@' ("#name" is not valid XPath).
var currentNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var endNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_2']");
// Walk the siblings between the two anchors and keep only the div elements.
var nodesYouWant = GetEnclosedNodes(currentNode, endNode).Where(node => node.Name == "div");
/// <summary>
/// Lazily yields the siblings that follow <paramref name="currentNode"/>, stopping
/// before <paramref name="endNode"/> or when there are no more siblings.
/// </summary>
/// <param name="currentNode">The node after which enumeration starts (not yielded itself).</param>
/// <param name="endNode">The node at which enumeration stops (not yielded).</param>
/// <returns>The siblings strictly between the two nodes, in document order.</returns>
private static IEnumerable<HtmlNode> GetEnclosedNodes(HtmlNode currentNode, HtmlNode endNode)
{
    for (var cursor = currentNode.NextSibling;
         cursor != null && cursor != endNode;
         cursor = cursor.NextSibling)
    {
        yield return cursor;
    }
}

I'm assuming you will know the name value of the two anchor tags.
var doc = new HtmlDocument();
// The document has to be loaded before it can be queried; `html` is the markup to search.
doc.LoadHtml(html);
// XPath attribute tests use '@' ("#name" is not valid XPath).
var firstAnchor = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
if (firstAnchor != null)
{
    // Node.Name is the tag name ("a", "div", ...), never the value of the name
    // attribute, so the closing anchor is recognised through its attribute value.
    // The null check stops the walk if the second anchor is missing.
    for (var sibling = firstAnchor.NextSibling;
         sibling != null && sibling.GetAttributeValue("name", "") != "doc_id_2";
         sibling = sibling.NextSibling)
    {
        // Siblings also include the text nodes between elements; only divs are wanted.
        if (sibling.Name != "div")
        {
            continue;
        }

        var divText = sibling.InnerText; //do whatever with this
    }
}

remove html tags from string using htmlagilitypack

I wonder how I could remove the HTML tags listed below using HtmlAgilityPack?
//markups to be removed
var markups = new List<string> { "br", "ol", "ul", "li" };

// Parse the description that should be cleaned.
var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(Description);
thanks

You can use this method:
/// <summary>
/// Removes everything that looks like an HTML tag from <paramref name="content"/> and
/// collapses each run of whitespace into a single space.
/// </summary>
/// <param name="content">The markup to clean; may be null.</param>
/// <returns>
/// The text with tags removed, or an empty string when <paramref name="content"/>
/// is null or empty. The result is not trimmed.
/// </returns>
public static string RemoveHTMLTags(string content)
{
    // Regex.Replace rejects null input, so handle it explicitly instead of
    // relying on a catch-all handler to turn the failure into an empty result.
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    // "<", anything that is not ">", then either ">" or the end of the input
    // (the latter covers a tag that was never closed).
    Regex tagRemove = new Regex(@"<[^>]*(>|$)");
    // \s already includes \r and \n.
    Regex compressSpaces = new Regex(@"\s+");

    string textOnly = tagRemove.Replace(content, string.Empty);
    return compressSpaces.Replace(textOnly, " ");
}

//markups to be removed
var markups = new List<string> { "br", "ol", "ul", "li" };
var xpath = String.Join(" | ", markups.Select(x => "//" + x));
var nodes = htmlDoc.DocumentNode.SelectNodes(xpath);
if (nodes != null)
{
foreach (var node in nodes)
{
node.Remove();
}
}

Develop Reference

C# (C-Sharp) is a programming language developed by Microsoft that runs on the .NET Framework.

htmlagilitypack - remove script and style? - c#

// Parse the markup, then detach every <script> and <style> element.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
var unwanted = doc.DocumentNode.Descendants()
    .Where(candidate => candidate.Name == "script" || candidate.Name == "style")
    .ToList();
foreach (var element in unwanted)
{
    element.Remove();
}

You can do so using the HtmlDocument class: HtmlDocument doc = new HtmlDocument(); doc.LoadHtml(input); doc.DocumentNode.SelectNodes("//style|//script").ToList().ForEach(n => n.Remove());

Related

How I remove unwanted pre and code tags of html content with HtmlAgilityPack [duplicate]

HtmlAgilityPack cant get string indexer

C# HtmlAgilityPack : startIndex cannot be larger than length of string

How to extract different divs between two anchor tags in htmlagility?

remove html tags from string using htmlagilitypack

Categories

Resources