Get plain text from HTML content


This code snippet is useful for getting the plain text out of HTML content. On many web pages a long description is entered through a multiline text box and later displayed inside a div element. If the entered text contains HTML tags, the tags are not shown as text; the browser renders them as HTML automatically. So we need to validate the entered text or remove the HTML tags from it. This code snippet shows how to get the plain text from HTML content.

Used Methods:

We can remove the HTML tags from a text using the string Replace and Remove methods together with regular expressions. With these methods we remove each kind of HTML tag explicitly.

Main HTML tags:

Generally, an HTML page contains the following basic tags: line break (br), head, html, script, body, title, style, div, table, tr, td, li, ul, ol, etc. So we first need to remove these basic tags. Once the tags are removed, we can easily get the plain text.

Steps to remove the tags:

Step 1 : Remove the repeated spaces. This regular expression replaces each run of repeated spaces with a single space.

// Collapse every run of one or more spaces into a single space.
result = System.Text.RegularExpressions.Regex.Replace(result,@"( )+", " ");

Step 2 : Remove the head section (the head tags and everything between them)


// Normalise every opening head tag (whatever its spacing or attributes) to "<head>".
// \b keeps the pattern from also matching <header>.
result = System.Text.RegularExpressions.Regex.Replace(result,@"<( )*head\b([^>])*>", "<head>",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Normalise every closing head tag to "</head>".
result = System.Text.RegularExpressions.Regex.Replace(result,@"(<( )*(/)( )*head( )*>)", "</head>",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Drop the head section together with its content. The lazy ".*?" stops at the first closing tag.
result = System.Text.RegularExpressions.Regex.Replace(result,"(<head>).*?(</head>)", string.Empty,System.Text.RegularExpressions.RegexOptions.IgnoreCase);


Step 3 : Remove all the javascript tags


// Normalise every opening script tag (whatever its spacing or attributes) to "<script>".
result = System.Text.RegularExpressions.Regex.Replace(result,@"<( )*script\b([^>])*>", "<script>",System.Text.RegularExpressions.RegexOptions.IgnoreCase);


// Normalise every closing script tag to "</script>".
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*script( )*>)", "</script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Remove each script element with its code. The match is lazy (".*?") so that
// visible text lying between two separate script elements is kept.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<script>).*?(</script>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);



Step 4 : Remove all the style tags

// Normalise every opening style tag (whatever its spacing or attributes) to "<style>".
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*style\b([^>])*>", "<style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Normalise every closing style tag to "</style>".
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*style( )*>)", "</style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Remove each style element together with its CSS text (lazy match, one element at a time).
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<style>).*?(</style>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);


Step 5 : insert tab spaces in each td. If the text has any td tags

// Each opening td tag (plus any br tags directly after it) becomes a tab,
// so the cells of a table row end up tab-separated.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*td([^>])*>(<( )*br( )*(/)*>)*", "\t",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Each closing td tag, with any br tags and spaces around it, is removed.
// NOTE(review): the published pattern was damaged (it had an unmatched ")");
// this is a reconstruction of the closing-td match.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*br( )*(/)*>)*( )*<( )*/( )*td([^>])*>(<( )*br( )*(/)*>)*", "",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);


Step 6 : insert line breaks in place of br and li tags

// A br tag (and any line breaks already preceding it) becomes one line break.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*br( )*(/)*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// An opening li tag becomes one line break. Attributes are allowed
// (e.g. <li class="x">) and \b keeps <link> from matching.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*li\b([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);



Step 7 : insert line paragraphs if the text has p, div or tr tags

// Opening and closing div tags start a new line.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( |/)*div([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Opening tr tags start a new line.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*tr([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Opening and closing p tags start a new line.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( |/)*p([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// NOTE(review): the last two published patterns were damaged beyond recovery
// (they were invalid regular expressions). Closing tr and closing li tags are
// used here as a reconstruction; both end the current line.
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*/( )*tr([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);

result = System.Text.RegularExpressions.Regex.Replace(result,
@"(\r|\n)*<( )*/( )*li([^>])*>", "\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);


In the same way, the remaining tags and the HTML character entities need to be replaced too. The following method combines all of the steps; use it to get the plain text from your HTML content.


/// <summary>
/// Converts HTML markup into plain text.
/// </summary>
/// <remarks>
/// The head, script and style sections are removed together with their content.
/// Table cells become tab-separated, line-level and block-level tags become line
/// breaks, every remaining tag is stripped, and a small set of character entities
/// is translated to plain-text equivalents. Runs of spaces and line breaks are
/// collapsed, and the result uses "\n" as its line separator.
/// </remarks>
/// <param name="source">The HTML text to convert; may be null or empty.</param>
/// <returns>
/// The plain-text rendering of <paramref name="source"/>, or an empty string when
/// <paramref name="source"/> is null or empty.
/// </returns>
private string GetPlainTextFromHTML(string source)
{
    // A null input previously ended up as "" through a catch-all handler; keep that
    // result but make it explicit. The catch-all itself is gone because it only
    // served to hide exceptions raised by malformed patterns.
    if (string.IsNullOrEmpty(source))
    {
        return "";
    }

    // Source formatting: line breaks count as spaces, tabs are dropped, and
    // repeated spaces collapse into one. After this the text is a single line,
    // so "." in the patterns below can span what used to be several lines.
    string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", string.Empty);
    result = ReplaceIgnoringCase(result, @"( )+", " ");

    // head / script / style: normalise the opening and closing tags, then remove
    // each element with its content. \b keeps e.g. <header> from matching "head".
    // The removal is lazy (".*?") so that text lying between two separate
    // elements of the same kind survives.
    result = ReplaceIgnoringCase(result, @"<( )*head\b([^>])*>", "<head>");
    result = ReplaceIgnoringCase(result, @"(<( )*(/)( )*head( )*>)", "</head>");
    result = ReplaceIgnoringCase(result, "(<head>).*?(</head>)", string.Empty);

    result = ReplaceIgnoringCase(result, @"<( )*script\b([^>])*>", "<script>");
    result = ReplaceIgnoringCase(result, @"(<( )*(/)( )*script( )*>)", "</script>");
    result = ReplaceIgnoringCase(result, @"(<script>).*?(</script>)", string.Empty);

    result = ReplaceIgnoringCase(result, @"<( )*style\b([^>])*>", "<style>");
    result = ReplaceIgnoringCase(result, @"(<( )*(/)( )*style( )*>)", "</style>");
    result = ReplaceIgnoringCase(result, "(<style>).*?(</style>)", string.Empty);

    // Table cells: an opening td (plus directly following br tags) becomes a tab;
    // a closing td with surrounding br tags and spaces is dropped.
    // NOTE(review): the closing-td pattern is a reconstruction; the published
    // pattern was not a valid regular expression.
    result = ReplaceIgnoringCase(result, @"<( )*td([^>])*>(<( )*br( )*(/)*>)*", "\t");
    result = ReplaceIgnoringCase(
        result,
        @"(<( )*br( )*(/)*>)*( )*<( )*/( )*td([^>])*>(<( )*br( )*(/)*>)*",
        "");

    // Line-level tags become a single line break. "(\r|\n)*" swallows breaks that
    // are already in front of the tag so they do not pile up.
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*br( )*(/)*>", "\r");
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*li\b([^>])*>", "\r");

    // Block-level tags become a line break as well.
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( |/)*div([^>])*>", "\r");
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*tr([^>])*>", "\r");
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( |/)*p([^>])*>", "\r");
    // NOTE(review): two further patterns at this point were damaged beyond
    // recovery; closing tr and closing li are used as a reconstruction.
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*/( )*tr([^>])*>", "\r");
    result = ReplaceIgnoringCase(result, @"(\r|\n)*<( )*/( )*li([^>])*>", "\r");

    // Anything still enclosed in < > (links, images, comments, ...) is removed.
    result = ReplaceIgnoringCase(result, @"<[^>]*>", string.Empty);

    // Known character entities get a plain-text equivalent.
    result = ReplaceIgnoringCase(result, @"&nbsp;", " ");
    result = ReplaceIgnoringCase(result, @"&bull;", " * ");
    result = ReplaceIgnoringCase(result, @"&lsaquo;", "<");
    result = ReplaceIgnoringCase(result, @"&rsaquo;", ">");
    result = ReplaceIgnoringCase(result, @"&trade;", "(tm)");
    result = ReplaceIgnoringCase(result, @"&frasl;", "/");
    result = ReplaceIgnoringCase(result, @"&lt;", "<");
    result = ReplaceIgnoringCase(result, @"&gt;", ">");
    result = ReplaceIgnoringCase(result, @"&copy;", "(c)");
    result = ReplaceIgnoringCase(result, @"&reg;", "(r)");

    // Every other entity is dropped. The pattern only accepts entity-shaped text
    // (optional '#', then letters/digits) so that ordinary text such as
    // "Tom & Jerry;" is left alone. "&amp;" is excluded here and decoded last,
    // which keeps an escaped entity like "&amp;lt;" from being decoded twice.
    result = ReplaceIgnoringCase(result, @"&(?!amp;)#?[a-z0-9]{2,8};", string.Empty);
    result = ReplaceIgnoringCase(result, @"&amp;", "&");

    // Whitespace clean-up: remove spaces caught between line breaks and tabs,
    // then collapse repeated line breaks and repeated spaces.
    result = ReplaceIgnoringCase(result, "(\r)*( )+(\r)", "\r\r");
    result = ReplaceIgnoringCase(result, "(\r)( )+", "\r");
    result = ReplaceIgnoringCase(result, "(\t)( )+(\t)", "\t\t");
    result = ReplaceIgnoringCase(result, "(\t)( )+(\r)", "\t\r");
    result = ReplaceIgnoringCase(result, "(\r)( )+(\t)", "\r\t");
    result = ReplaceIgnoringCase(result, "(\r)(\t)+(\r)", "\r\r");
    result = ReplaceIgnoringCase(result, "(\r)(\t)+", "\r\t");
    result = ReplaceIgnoringCase(result, "[\r]+", "\r");
    result = ReplaceIgnoringCase(result, "[ ]+", " ");

    // Line breaks are handled as "\r" internally and emitted as "\n". Leading and
    // trailing spaces and line breaks are trimmed; tabs are kept because they
    // mark table cells.
    result = result.Replace("\r", "\n");
    return result.Trim(' ', '\n');
}

/// <summary>
/// Replaces every case-insensitive match of <paramref name="pattern"/> in
/// <paramref name="input"/> with <paramref name="replacement"/>.
/// </summary>
private static string ReplaceIgnoringCase(string input, string pattern, string replacement)
{
    return System.Text.RegularExpressions.Regex.Replace(
        input,
        pattern,
        replacement,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}


Reference: http://forums.asp.net/p/1480598/3454364.aspx


Comments

No responses found. Be the first to comment...


  • Do not include your name, "with regards" etc in the comment. Write detailed comment, relevant to the topic.
  • No HTML formatting and links to other web sites are allowed.
  • This is a strictly moderated site. Absolutely no spam allowed.
  • Name:
    Email: