Get plain text from HTML content
This code snippet shows how to get the plain text from HTML content. Many web pages let users enter long descriptions through a multiline text box and then display the entered text inside a div element. If the text contains HTML tags, the tags are not displayed as text; the browser renders them as HTML automatically. So we need to validate the entered text or remove the HTML tags from it. In this code snippet, I explain how to get the plain text from HTML content.
Used Methods:
We can remove the HTML tags from a text using the Replace and Remove string methods together with regular expressions. With these methods we have to remove each kind of HTML tag explicitly.
Main HTML tags:
Generally, an HTML page contains the following basic tags: line break (br), head, html, script, body, title, style, div, table, tr, td, li, ul, ol, etc. So we first need to remove these basic tags. Once the tags are removed, we can easily get the plain text.
Steps to remove the tags:
Step 1 : Remove the repeated spaces. This regular expression replaces each run of repeated spaces with a single space.
// Collapse each run of consecutive spaces into one space.
result = System.Text.RegularExpressions.Regex.Replace(
    result,
    @"( )+",
    " ");
Step 2 : Remove the head section (the head tags and everything between them)
// Normalize every opening head tag (with or without attributes) to a bare <head>.
result = System.Text.RegularExpressions.Regex.Replace(result,@"<( )*head([^>])*>", "<head>",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Normalize every closing head tag (e.g. "< / head >") to a bare </head>.
result = System.Text.RegularExpressions.Regex.Replace(result,@"(<( )*(/)( )*head( )*>)", "</head>",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove the head element and its content. The lazy ".*?" stops at the first </head>,
// so nothing after the head section is removed. Note that "." does not match "\n",
// so line breaks must already have been replaced in result.
result = System.Text.RegularExpressions.Regex.Replace(result,"(<head>).*?(</head>)", string.Empty,System.Text.RegularExpressions.RegexOptions.IgnoreCase);
Step 3 : Remove all the javascript tags
// Normalize every opening script tag (with or without attributes) to a bare <script>.
result = System.Text.RegularExpressions.Regex.Replace(result,@"<( )*script([^>])*>", "<script>",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Normalize every closing script tag to a bare </script>.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*script( )*>)", "</script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove each script element together with its content. The lazy ".*?" removes the
// script blocks one by one; a greedy ".*" would also delete all the page content that
// lies between the first <script> and the last </script>.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<script>).*?(</script>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
Step 4 : Remove all the style tags
// Normalize every opening style tag (with or without attributes) to a bare <style>.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*style([^>])*>", "<style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Normalize every closing style tag to a bare </style>.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*style( )*>)", "</style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove each style element together with its CSS content (lazy match, one block at a time).
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<style>).*?(</style>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
Step 5 : Insert a tab in place of each td tag, if the text has any td tags
// Replace each opening td tag (and any br tags that directly follow it) with a tab,
// so that table cells stay separated in the plain text.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*td([^>])*>(<( )*br( )*(/)*>)*", "\t",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove each closing td tag together with the br tags and spaces around it.
// NOTE(review): the "</td" part of this pattern is reconstructed; confirm against the
// referenced forum post.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*br( )*(/)*>)*( )*</td([^>])*>(<( )*br( )*(/)*>)*", "",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
Step 6 : insert line breaks in place of br and li tags
// Replace each br tag (<br>, <br/>, <br />), plus any line breaks directly before it,
// with a single line break.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*br( )*(/)*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Replace each opening li tag with a line break. "([^>])*" also accepts li tags that
// carry attributes, and "\b" keeps the pattern from matching other tags that merely
// start with "li" (for example <link>).
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*li\b([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
Step 7 : Insert paragraph breaks if the text has p, div, tr, ol or ul tags
// Opening and closing div tags become a line break.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( |/)*div([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Opening tr tags become a line break.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*tr([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Opening and closing p tags become a line break.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( |/)*p([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Closing tr tags become a line break.
// NOTE(review): the tag name in this pattern is reconstructed as "</tr"; confirm against
// the referenced forum post.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*</tr([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Opening and closing ol/ul tags become a line break. The leading "<( |/)*" anchors the
// pattern to real tags; without it, ordinary words containing "ol" or "ul" are deleted
// up to the next ">".
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( |/)*(ol|ul)([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
The remaining tags and the HTML character entities need the same kind of replacement. Please use the following method to get plain text from your HTML content.
/// <summary>
/// Converts HTML markup to plain text by applying a fixed sequence of
/// regular-expression replacements.
/// </summary>
/// <remarks>
/// Processing order: line breaks and tabs of the markup are flattened; the head, script
/// and style elements are removed together with their content; td, br, li, div, tr, p,
/// ol and ul tags are turned into tabs or line breaks; every remaining tag is dropped;
/// a small set of character entities is decoded and any other short entity is deleted;
/// finally redundant whitespace is collapsed and line breaks are emitted as "\n".
/// This is a best-effort text extractor, not an HTML parser.
/// </remarks>
/// <param name="source">The HTML to convert. May be null or empty.</param>
/// <returns>The plain text, or an empty string when <paramref name="source"/> is null or empty.</returns>
private string GetPlainTextFromHTML(string source)
{
    // A null source used to surface as a swallowed NullReferenceException that returned "".
    // Handle it explicitly so the catch-all that hid broken patterns is no longer needed.
    if (string.IsNullOrEmpty(source))
    {
        return "";
    }

    // Line breaks and tabs inside markup carry no meaning; flatten them first so that
    // "." in the patterns below can span a whole element.
    string result = source.Replace("\r", " ");
    result = result.Replace("\n", " ");
    result = result.Replace("\t", string.Empty);
    result = ReplaceIgnoringCase(result, @"( )+", " ");

    // head: normalize the tags, then remove the element with its content.
    // The lazy ".*?" stops at the first closing tag instead of the last one.
    result = ReplaceIgnoringCase(result, @"<( )*head([^>])*>", "<head>");
    result = ReplaceIgnoringCase(result, @"(<( )*(/)( )*head( )*>)", "</head>");
    result = ReplaceIgnoringCase(result, "(<head>).*?(</head>)", string.Empty);

    // script: same approach. A greedy ".*" would also delete the page content lying
    // between two separate script blocks.
    result = ReplaceIgnoringCase(result, @"<( )*script([^>])*>", "<script>");
    result = ReplaceIgnoringCase(result, @"(<( )*(/)( )*script( )*>)", "</script>");
    result = ReplaceIgnoringCase(result, @"(<script>).*?(</script>)", string.Empty);

    // style: same approach.
    result = ReplaceIgnoringCase(result, @"<( )*style([^>])*>", "<style>");
    result = ReplaceIgnoringCase(result, @"(<( )*(/)( )*style( )*>)", "</style>");
    result = ReplaceIgnoringCase(result, "(<style>).*?(</style>)", string.Empty);

    // td: an opening cell tag (plus directly following br tags) becomes a tab; a closing
    // cell tag and the br tags / spaces around it are removed.
    // NOTE(review): the "</td" prefix and the tab replacement are reconstructed from a
    // garbled listing; confirm against the referenced forum post.
    result = ReplaceIgnoringCase(result, @"<( )*td([^>])*>(<( )*br( )*(/)*>)*", "\t");
    result = ReplaceIgnoringCase(result, @"(<( )*br( )*(/)*>)*( )*</td([^>])*>(<( )*br( )*(/)*>)*", "");

    // br and li become a line break. "\b([^>])*" lets li tags carry attributes without
    // matching other tags that merely start with "li" (for example <link>).
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*br( )*(/)*>", "\r");
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*li\b([^>])*>", "\r");

    // div, tr, p, ol and ul become a line break.
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( |/)*div([^>])*>", "\r");
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*tr([^>])*>", "\r");
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( |/)*p([^>])*>", "\r");
    // NOTE(review): tag name reconstructed as "</tr"; confirm against the forum post.
    result = ReplaceIgnoringCase(result, @"(\r|\n)*</tr([^>])*>", "\r");
    // Anchored on "<" so that ordinary words containing "ol" or "ul" are left alone.
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( |/)*(ol|ul)([^>])*>", "\r");

    // Drop every remaining tag. This must run before the entities are decoded so that a
    // decoded "<" or ">" is never mistaken for markup.
    result = ReplaceIgnoringCase(result, @"<[^>]*>", string.Empty);

    // Decode the supported character entities.
    // NOTE(review): the published listing shows these patterns as already-decoded
    // characters; the entity names are reconstructed, and the literal characters are
    // kept as alternatives so both spellings are handled.
    result = ReplaceIgnoringCase(result, @"&nbsp;|&#160;|\u00A0", " ");
    result = ReplaceIgnoringCase(result, @"&bull;|\u2022", " * ");
    result = ReplaceIgnoringCase(result, @"&lsaquo;|\u2039", "<");
    result = ReplaceIgnoringCase(result, @"&rsaquo;|\u203A", ">");
    result = ReplaceIgnoringCase(result, @"&trade;|\u2122", "(tm)");
    result = ReplaceIgnoringCase(result, @"&frasl;", "/");
    result = ReplaceIgnoringCase(result, @"&lt;", "<");
    result = ReplaceIgnoringCase(result, @"&gt;", ">");
    result = ReplaceIgnoringCase(result, @"&copy;|\u00A9", "(c)");
    result = ReplaceIgnoringCase(result, @"&reg;|\u00AE", "(r)");
    // Any other short entity is deleted rather than decoded.
    result = ReplaceIgnoringCase(result, @"&(.{2,6});", string.Empty);

    // Whitespace clean-up. Work with "\r" only, then strip spaces around line breaks and
    // tabs, collapse repeated line breaks and spaces, and finally emit "\n".
    result = result.Replace("\n", "\r");
    result = ReplaceIgnoringCase(result, "(\r)*( )+(\r)", "\r\r");
    result = ReplaceIgnoringCase(result, "(\r)( )+", "\r");
    result = ReplaceIgnoringCase(result, "(\t)( )+(\t)", "\t\t");
    result = ReplaceIgnoringCase(result, "(\t)( )+(\r)", "\t\r");
    result = ReplaceIgnoringCase(result, "(\r)( )+(\t)", "\r\t");
    result = ReplaceIgnoringCase(result, "(\r)(\t)+(\r)", "\r\r");
    result = ReplaceIgnoringCase(result, "(\r)(\t)+", "\r\t");
    result = ReplaceIgnoringCase(result, "[\r]+", "\r");
    result = ReplaceIgnoringCase(result, "[ ]+", " ");
    result = result.Replace("\r", "\n");

    // Remove a trailing and a leading line break or space.
    result = ReplaceIgnoringCase(result, "[\n ]$", "");
    result = ReplaceIgnoringCase(result, "^[\n ]", "");
    return result;
}

/// <summary>
/// Replaces every case-insensitive match of <paramref name="pattern"/> in
/// <paramref name="input"/> with <paramref name="replacement"/>.
/// </summary>
private static string ReplaceIgnoringCase(string input, string pattern, string replacement)
{
    return System.Text.RegularExpressions.Regex.Replace(
        input,
        pattern,
        replacement,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}
Reference: http://forums.asp.net/p/1480598/3454364.aspx