Parsing HTML in Microsoft C#

출처 : http://www.developer.com/net/csharp/article.php/2230091/Parsing-HTML-in-Microsoft-C.htm

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using HTML;

using System.IO;

namespace ParseHTMLTester
{
    /// <summary>
    /// Small console driver for the HTML parser: reads a saved HTML page,
    /// walks every tag in it and prints either the tag name or, for
    /// <c>th</c> tags, the value of their <c>class</c> attribute.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Expects the HTML to scan to be stored in a file
        /// named <c>1.txt</c> in the current working directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string data;

            // Read the whole file into a single string. The using block
            // guarantees the file handle is released even if reading fails.
            using (StreamReader reader = new StreamReader("1.txt"))
            {
                data = reader.ReadToEnd();
            }

            HTML.ParseHTML parse = new HTML.ParseHTML();
            parse.Source = data;

            while (!parse.Eof())
            {
                char ch = parse.Parse();

                // Parse() returns 0 when it has just consumed a tag;
                // any other value is an ordinary character, which we skip.
                if (ch != 0)
                {
                    continue;
                }

                AttributeList tag = parse.GetTag();

                // HTML tag names are case-insensitive, so <TH> must match too.
                if (string.Equals(tag.Name, "th", StringComparison.OrdinalIgnoreCase))
                {
                    // For th tags, print the value of the class attribute
                    // (th tags without a class attribute print nothing).
                    Attribute cssClass = tag["class"];
                    if (cssClass != null)
                    {
                        Console.WriteLine("th class=" + cssClass.Value);
                    }
                }
                else
                {
                    // Every other tag is simply printed by name.
                    Console.WriteLine(tag.Name);
                }
            }
        }
    }
}

[ HTML Parser 사용법 ]

// Minimal usage: scan a page and print every href attribute found.
ParseHTML parse = new ParseHTML();
parse.Source = page;
// The end-of-input test is Eof() (not EOF()), matching the Parse class.
while (!parse.Eof())
{
    // Parse() returns 0 each time it has consumed a complete tag.
    char ch = parse.Parse();
    if (ch == 0)
    {
        AttributeList tag = parse.GetTag();
        if (tag["href"] != null)
        {
            System.Console.WriteLine("Found link : " + tag["href"].Value);
        }
    }
}

위 코드에서는 ParseHTML 객체를 생성한 뒤, Source 프로퍼티에 파싱할 HTML 페이지(page)를 지정한다.
루프는 페이지 끝에 도달할 때까지 계속되며, Parse()가 0을 리턴하여 태그를 찾았음을 알릴 때까지 일반 문자들은 무시된다.
(즉, ch == 0 이 될 때까지.)
태그를 찾을 때마다 href attribute가 있는지 확인하고, 있으면 해당 링크를 출력한다.

[ The Attribute Class ]

이 클래스는 HTML 태그의 attribute 하나를 저장하기 위해서 사용한다.

예를 들어

<img src="picture.gif" alt="some Picture">

이 tag의 경우, attribute 는 src 와 alt 가 되고,
각 attribute 의 value 는 picture.gif 와 some Picture 이 된다.

attribute는 3가지 property를 가지는데, name, value, delim 이다.
name 은 attribute의 이름, value 는 그 값을 저장하고, delim 은 value 를 감싸는 구분 문자(" 또는 ')를 저장한다.

using System;

namespace HTML
{
  /// <summary>
  /// One name/value pair as written inside an HTML or XML tag,
  /// together with the quote character that wrapped the value.
  /// This source code may be used freely under the
  /// Limited GNU Public License(LGPL).
  ///
  /// Written by Jeff Heaton (http://www.jeffheaton.com)
  /// </summary>
  public class Attribute: ICloneable
  {
    /// <summary>
    /// Create an attribute with every property supplied explicitly.
    /// </summary>
    /// <param name="name">The attribute's name.</param>
    /// <param name="value">The attribute's value.</param>
    /// <param name="delim">The character that delimits the value
    /// (i.e. " or '), or (char)0 when there is none.</param>
    public Attribute(string name,string value,char delim)
    {
      Name  = name;
      Value = value;
      Delim = delim;
    }

    /// <summary>
    /// Create a blank attribute: empty name, empty value, no delimiter.
    /// </summary>
    public Attribute():this("","",'\0')
    {
    }

    /// <summary>
    /// Create an attribute whose value has no delimiter.
    /// </summary>
    /// <param name="name">The attribute's name.</param>
    /// <param name="value">The attribute's value.</param>
    public Attribute(String name,String value):this(name,value,'\0')
    {
    }

    /// <summary>
    /// The delimiter that surrounds the value, or (char)0 for none.
    /// </summary>
    public char Delim { get; set; }

    /// <summary>
    /// The name of this attribute.
    /// </summary>
    public string Name { get; set; }

    /// <summary>
    /// The value of this attribute.
    /// </summary>
    public string Value { get; set; }

    #region ICloneable Members
    /// <summary>
    /// Produce a new Attribute carrying the same name, value and
    /// delimiter as this one.
    /// </summary>
    /// <returns>The newly created copy.</returns>
    public virtual object Clone()
    {
      return new Attribute(Name,Value,Delim);
    }
    #endregion
  }
}

[ The AttributeList Class ]

HTML tag는 종종 여러 개의 Attribute로 이루어지는데,
이 클래스는 그런 Attribute의 리스트를 저장한다.

이 AttributeList 클래스는 name 과 attribute의 collection으로 이루어진다.
AttributeList 는 숫자 인덱스 또는 attribute 이름으로 각각의 attribute에 접근할 수 있다.
위 예에서, attribute 인 src 같은 경우 AttributeList에서 접근을 하려면

theTag[0]
theTag["src"]

로 가능하다.

using System;
using System.Collections;

namespace HTML
{
  /// <summary>
  /// The AttributeList class stores an ordered list of Attribute
  /// objects. Because it derives from Attribute it also carries its
  /// own Name, Value and Delim.
  /// This source code may be used freely under the
  /// Limited GNU Public License(LGPL).
  ///
  /// Written by Jeff Heaton (http://www.jeffheaton.com)
  /// </summary>
  public class AttributeList:Attribute
  {
    /// <summary>
    /// The backing list that holds every attribute, in the order in
    /// which it was added.
    /// </summary>
    protected ArrayList m_list;

    /// <summary>
    /// Make a copy of this object: the list's own Name, Value and Delim
    /// are copied and every contained attribute is cloned.
    /// </summary>
    /// <returns>A new AttributeList that shares no Attribute instances
    /// with this one.</returns>
    public override Object Clone()
    {
      AttributeList rtn = new AttributeList();

      // The list is itself an Attribute; without these three lines the
      // copy would silently lose the list's own name/value/delimiter.
      rtn.Name  = Name;
      rtn.Value = Value;
      rtn.Delim = Delim;

      for ( int i=0;i<m_list.Count;i++ )
        rtn.Add( (Attribute)this[i].Clone() );

      return rtn;
    }

    /// <summary>
    /// Create a new, empty, attribute list.
    /// </summary>
    public AttributeList():base("","")
    {
      m_list = new ArrayList();
    }


    /// <summary>
    /// Add the specified attribute to the end of the list.
    /// </summary>
    /// <param name="a">An attribute to add to this
    /// AttributeList.</param>
    public void Add(Attribute a)
    {
      m_list.Add(a);
    }


    /// <summary>
    /// Remove all attributes from this AttributeList, returning
    /// it to an empty state.
    /// </summary>
    public void Clear()
    {
      m_list.Clear();
    }

    /// <summary>
    /// Returns true if this AttributeList contains no attributes.
    /// </summary>
    /// <returns>True if this AttributeList is empty, false
    /// otherwise.</returns>
    public bool IsEmpty()
    {
      return m_list.Count==0;
    }

    /// <summary>
    /// If there is already an attribute with the specified name,
    /// its value is changed to the specified value. If there is no
    /// Attribute with the specified name, one is created and added.
    /// The name lookup is case-insensitive.
    /// </summary>
    /// <param name="name">The name of the Attribute to edit or
    /// create.  Case-insensitive.  A null name is ignored.</param>
    /// <param name="value">The value to be held in this
    /// attribute.  Null is stored as an empty string.</param>
    public void Set(string name,string value)
    {
      if ( name==null )
        return;
      if ( value==null )
        value="";

      Attribute a = this[name];

      if ( a==null )
      {
        a = new Attribute(name,value);
        Add(a);
      }
      else
        a.Value = value;
    }

    /// <summary>
    /// How many attributes are in this AttributeList?
    /// </summary>
    public int Count
    {
      get
      {
        return m_list.Count;
      }
    }

    /// <summary>
    /// The underlying list of attributes (the live list, not a copy).
    /// </summary>
    public ArrayList List
    {
      get
      {
        return m_list;
      }
    }

    /// <summary>
    /// Access an individual attribute by position.
    /// </summary>
    /// <param name="index">Zero-based position of the attribute.</param>
    /// <returns>The attribute at that position, or null when the index
    /// is outside the list (negative, or not less than Count).</returns>
    public Attribute this[int index]
    {
      get
      {
        // A negative index used to fall through to the ArrayList and
        // throw; treat it like any other out-of-range index.
        if ( index>=0 && index<m_list.Count )
          return(Attribute)m_list[index];
        else
          return null;
      }
    }

    /// <summary>
    /// Access an individual attribute by name (case-insensitive).
    /// </summary>
    /// <param name="index">The attribute name to look for.</param>
    /// <returns>The first attribute whose name matches, or null when
    /// there is none or when the requested name is null.</returns>
    public Attribute this[string index]
    {
      get
      {
        if ( index==null )
          return null;

        for ( int i=0;i<m_list.Count;i++ )
        {
          Attribute a = m_list[i] as Attribute;

          // Ordinal comparison keeps the match independent of the
          // current culture and tolerates attributes with a null name.
          if ( a!=null &&
            string.Equals(a.Name,index,
                          StringComparison.OrdinalIgnoreCase) )
            return a;
        }

        return null;
      }
    }
  }
}

[ The Parse Class ]

단순히 HTML을 파싱하는 것이 목적이라면, 이 Parse 클래스를 직접 사용할 필요는 없다.
이 Parse 클래스는 HTML, SGML, XML, HTTP header 처럼 attribute-value 쌍을 기반으로 하는 형식을 low-level에서 처리하기 위해 HTML parser 가 내부적으로 사용하는 클래스이기 때문이다.

using System;

namespace HTML
{
  /// <summary>
  /// Base class for parsing tag based files, such as HTML,
  /// HTTP headers, or XML. It keeps the source text, a cursor into
  /// that text, and the pieces (name, value, delimiter) of the
  /// attribute currently being read.
  ///
  /// This source code may be used freely under the
  /// Limited GNU Public License(LGPL).
  ///
  /// Written by Jeff Heaton (http://www.jeffheaton.com)
  /// </summary>
  public class Parse:AttributeList
  {
    /// <summary>
    /// The source text that is being parsed.
    /// </summary>
    private string m_source;

    /// <summary>
    /// The current position inside of the text that
    /// is being parsed.
    /// </summary>
    private int m_idx;

    /// <summary>
    /// The most recently parsed attribute delimiter.
    /// </summary>
    private char m_parseDelim;

    /// <summary>
    /// The most recently parsed attribute name.
    /// </summary>
    private string m_parseName;

    /// <summary>
    /// The most recently parsed attribute value.
    /// </summary>
    private string m_parseValue;

    /// <summary>
    /// The most recently parsed tag.
    /// </summary>
    public string m_tag;

    /// <summary>
    /// Determine if the specified character is whitespace or not.
    /// Only tab, line feed, carriage return and space count.
    /// </summary>
    /// <param name="ch">A character to check</param>
    /// <returns>true if the character is whitespace</returns>
    public static bool IsWhiteSpace(char ch)
    {
      return( "\t\n\r ".IndexOf(ch) != -1 );
    }


    /// <summary>
    /// Advance the index until past any whitespace.
    /// </summary>
    public void EatWhiteSpace()
    {
      while ( !Eof() && IsWhiteSpace(GetCurrentChar()) )
        m_idx++;
    }

    /// <summary>
    /// Determine if the end of the source text has been reached.
    /// A parser with no source assigned is treated as being at the end.
    /// </summary>
    /// <returns>True if the end of the source text has been
    /// reached.</returns>
    public bool Eof()
    {
      return( m_source==null || m_idx>=m_source.Length );
    }

    /// <summary>
    /// Parse the attribute name: skip leading whitespace, append every
    /// character up to the next whitespace, '=' or '>' to ParseName,
    /// then skip trailing whitespace.
    /// </summary>
    public void ParseAttributeName()
    {
      EatWhiteSpace();

      int start = m_idx;
      while ( !Eof() )
      {
        char ch = GetCurrentChar();
        if ( IsWhiteSpace(ch) || (ch=='=') || (ch=='>') )
          break;
        m_idx++;
      }

      // Append the whole run at once instead of one char at a time.
      if ( m_idx>start )
        m_parseName+=m_source.Substring(start,m_idx-start);

      EatWhiteSpace();
    }


    /// <summary>
    /// Parse the attribute value. Does nothing unless the cursor is on
    /// '=' and no delimiter has been recorded yet. A value may be
    /// wrapped in ' or " (the quote is stored in ParseDelim) or be
    /// unquoted, in which case it ends at whitespace or '>'.
    /// </summary>
    public void ParseAttributeValue()
    {
      if ( m_parseDelim!=0 )
        return;

      if ( GetCurrentChar()!='=' )
        return;

      m_idx++;
      EatWhiteSpace();

      char first = GetCurrentChar();
      int start;

      if ( (first=='\'') || (first=='\"') )
      {
        m_parseDelim = first;
        m_idx++;
        start = m_idx;

        // The Eof() test is essential: without it an unterminated
        // quote at the end of the text would never leave this loop,
        // because GetCurrentChar() keeps returning (char)0 there.
        while ( !Eof() && GetCurrentChar()!=m_parseDelim )
          m_idx++;

        if ( m_idx>start )
          m_parseValue+=m_source.Substring(start,m_idx-start);

        // Step over the closing quote.
        m_idx++;
      }
      else
      {
        start = m_idx;

        while ( !Eof() &&
          !IsWhiteSpace(GetCurrentChar()) &&
          (GetCurrentChar()!='>') )
          m_idx++;

        if ( m_idx>start )
          m_parseValue+=m_source.Substring(start,m_idx-start);
      }

      EatWhiteSpace();
    }

    /// <summary>
    /// Add the attribute described by ParseName, ParseValue and
    /// ParseDelim to the collection.
    /// </summary>
    public void AddAttribute()
    {
      Attribute a = new Attribute(m_parseName,
        m_parseValue,m_parseDelim);
      Add(a);
    }


    /// <summary>
    /// Get the current character that is being parsed.
    /// </summary>
    /// <returns>The character under the cursor, or (char)0 when the
    /// cursor is past the end of the text.</returns>
    public char GetCurrentChar()
    {
      return GetCurrentChar(0);
    }



    /// <summary>
    /// Get a few characters ahead of the current character.
    /// </summary>
    /// <param name="peek">How many characters to peek ahead
    /// for.</param>
    /// <returns>The character that was retrieved, or (char)0 when the
    /// requested position lies outside the text.</returns>
    public char GetCurrentChar(int peek)
    {
      int pos = m_idx+peek;

      if( m_source!=null && pos>=0 && pos<m_source.Length )
        return m_source[pos];
      else
        return (char)0;
    }



    /// <summary>
    /// Obtain the character under the cursor and advance the index
    /// by one. Unlike GetCurrentChar this performs no bounds check,
    /// so callers must test Eof() first.
    /// </summary>
    /// <returns>The character that was under the cursor.</returns>
    public char AdvanceCurrentChar()
    {
      return m_source[m_idx++];
    }



    /// <summary>
    /// Move the index forward by one.
    /// </summary>
    public void Advance()
    {
      m_idx++;
    }


    /// <summary>
    /// The last attribute name that was encountered.
    /// </summary>
    public string ParseName
    {
      get
      {
        return m_parseName;
      }

      set
      {
        m_parseName = value;
      }
    }

    /// <summary>
    /// The last attribute value that was encountered.
    /// </summary>
    public string ParseValue
    {
      get
      {
        return m_parseValue;
      }

      set
      {
        m_parseValue = value;
      }
    }

    /// <summary>
    /// The last attribute delimiter that was encountered.
    /// </summary>
    public char ParseDelim
    {
      get
      {
        return m_parseDelim;
      }

      set
      {
        m_parseDelim = value;
      }
    }

    /// <summary>
    /// The text that is to be parsed. Assigning a new text rewinds the
    /// cursor to its beginning.
    /// </summary>
    public string Source
    {
      get
      {
        return m_source;
      }

      set
      {
        m_source = value;
        // The cursor is private and can only move forward, so without
        // this reset a parser could never be reused for a second text.
        m_idx = 0;
      }
    }
  }
}

[ The ParseHTML Class ]

이 ParseHTML 클래스는 Parse 클래스의 subclass 이다. 이 ParseHTML 클래스는 HTML 파싱을 하기 위한 특별한 코드들을 제공한다.
이 ParseHTML 클래스는 HTML parser 의 주 인터페이스가 될것이다.
주로 사용될 메소드는

public char Parse()
public AttributeList GetTag()

Parse 메소드의 경우, HTML 에서 파싱한 다음 문자를 회수하게 된다.
만약, 다음 문자가 tag의 부분이라면 0 이 리턴된다.
Parse() 가 0 을 리턴한다면, HTML tag를 처리해야 한다.
tag에 접근을 하려면 GetTag 메소드를 사용하면 된다.
GetTag() 메소드는 AttributeList 객체를 리턴하는데,
이 객체에는 tag 이름(Name)과 해당 tag의 attribute가 전부 들어 있다.

using System;

namespace HTML
{
  /// <summary>
  /// HTML-specific parser built on Parse. Call Parse() repeatedly
  /// until Eof(); whenever it returns 0 a complete tag has been read
  /// and can be retrieved with GetTag() or re-serialized with
  /// BuildTag().
  /// </summary>

  public class ParseHTML:Parse
  {
    /// <summary>
    /// Get a snapshot of the most recently parsed tag.
    /// </summary>
    /// <returns>A new AttributeList whose Name is the tag name and
    /// which holds clones of the tag's attributes.</returns>
    public AttributeList GetTag()
    {
      AttributeList tag = new AttributeList();
      tag.Name = m_tag;

      foreach(Attribute x in List)
      {
        tag.Add((Attribute)x.Clone());
      }

      return tag;
    }

    /// <summary>
    /// Rebuild the text of the most recently parsed tag from its name
    /// and attributes.
    /// </summary>
    /// <returns>The tag as text, starting with "&lt;" and ending with
    /// "&gt;".</returns>
    public String BuildTag()
    {
      System.Text.StringBuilder buffer =
        new System.Text.StringBuilder("<");
      buffer.Append(m_tag);

      for ( int i=0;i<Count;i++ )
      {
        Attribute a = this[i];
        if ( a==null )
          continue;

        buffer.Append(" ");

        if ( a.Value==null )
        {
          // No value at all: emit only the (optionally quoted) name.
          if ( a.Delim!=0 )
            buffer.Append(a.Delim);
          buffer.Append(a.Name);
          if ( a.Delim!=0 )
            buffer.Append(a.Delim);
        }
        else
        {
          buffer.Append(a.Name);
          buffer.Append("=");
          if ( a.Delim!=0 )
            buffer.Append(a.Delim);
          buffer.Append(a.Value);
          if ( a.Delim!=0 )
            buffer.Append(a.Delim);
        }
      }

      buffer.Append(">");
      return buffer.ToString();
    }

    /// <summary>
    /// Parse one tag. The cursor must be on the first character after
    /// the opening "&lt;". On return m_tag holds the tag name (or the
    /// comment text for "!--" comments), the attribute list holds the
    /// tag's attributes, and the cursor is past the closing "&gt;".
    /// </summary>
    protected void ParseTag()
    {
      m_tag="";
      Clear();

      // Is it a comment?
      if ( (GetCurrentChar()=='!') &&
        (GetCurrentChar(1)=='-')&&
        (GetCurrentChar(2)=='-') )
      {
        System.Text.StringBuilder comment =
          new System.Text.StringBuilder();

        while ( !Eof() )
        {
          if ( (GetCurrentChar()=='-') &&
            (GetCurrentChar(1)=='-')&&
            (GetCurrentChar(2)=='>') )
            break;
          // Carriage returns are dropped from the stored comment.
          if ( GetCurrentChar()!='\r' )
            comment.Append(GetCurrentChar());
          Advance();
        }

        comment.Append("--");
        m_tag = comment.ToString();

        // Step over the closing "-->".
        Advance();
        Advance();
        Advance();
        ParseDelim = (char)0;
        return;
      }

      // Find the tag name
      System.Text.StringBuilder name = new System.Text.StringBuilder();
      while ( !Eof() )
      {
        char ch = GetCurrentChar();
        if ( IsWhiteSpace(ch) || (ch=='>') )
          break;
        name.Append(ch);
        Advance();
      }
      m_tag = name.ToString();

      EatWhiteSpace();

      // Get the attributes. The Eof() test stops the loop when the
      // text ends inside a tag; GetCurrentChar() returns (char)0 there,
      // which never equals '>', so the loop would otherwise keep
      // adding empty attributes forever.
      while ( !Eof() && GetCurrentChar()!='>' )
      {
        ParseName  = "";
        ParseValue = "";
        ParseDelim = (char)0;

        ParseAttributeName();

        if ( GetCurrentChar()=='>' )
        {
          AddAttribute();
          break;
        }

        // Get the value(if any)
        ParseAttributeValue();
        AddAttribute();
      }

      // Step over the closing '>'.
      Advance();
    }


    /// <summary>
    /// Consume the next piece of the source text.
    /// </summary>
    /// <returns>(char)0 when a tag was consumed (retrieve it with
    /// GetTag), otherwise the ordinary character that was consumed.
    /// A "&lt;" that does not start a tag is returned as an ordinary
    /// character.</returns>
    public char Parse()
    {
      if( GetCurrentChar()=='<' )
      {
        // Look at the following character without consuming anything,
        // so a stray '<' is neither lost nor able to push the cursor
        // past the end of the text.
        char next = GetCurrentChar(1);

        // Explicit ASCII ranges: a culture-sensitive upper-casing
        // (e.g. Turkish 'i') must not decide what counts as a tag.
        bool startsTag =
          ( (next>='A') && (next<='Z') ) ||
          ( (next>='a') && (next<='z') ) ||
          (next=='!') || (next=='/');

        if ( startsTag )
        {
          Advance();
          ParseTag();
          return (char)0;
        }
      }

      return AdvanceCurrentChar();
    }
  }
}

[ The FindLinks Class ]

using System;
using System.Net;
using System.IO;


namespace HTML
{
  /// <summary>
  /// FindLinks is a class that will test the HTML parser.
  /// This short example will prompt for a URL and then
  /// scan that URL for links.
  /// This source code may be used freely under the
  /// Limited GNU Public License(LGPL).
  ///
  /// Written by Jeff Heaton (http://www.jeffheaton.com)
  /// </summary>
  // The class declaration had been commented out, which left the
  // methods directly inside the namespace and one closing brace
  // unmatched. If this class is built into an assembly that contains
  // another Main method, choose the entry point with the compiler's
  // main/startup-object option.
  class FindLinks
  {
    /// <summary>
    /// The main entry point for the application: asks for a URL,
    /// downloads it and prints the value of every href attribute.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    [STAThread]
    static void Main(string[] args)
    {
      System.Console.Write("Enter a URL address:");
      string url = System.Console.ReadLine();
      System.Console.WriteLine("Scanning hyperlinks at: " + url );
      string page = GetPage(url);
      if(page==null)
      {
        System.Console.WriteLine("Can't process that type of file,"
                                  +
                                  "please specify an HTML file URL."
                                  );
        return;
      }

      ParseHTML parse = new ParseHTML();
      parse.Source = page;
      while( !parse.Eof() )
      {
        char ch = parse.Parse();
        if(ch==0)
        {
          AttributeList tag = parse.GetTag();
          if( tag["href"]!=null )
          {
            System.Console.WriteLine( "Found link: " +
                                       tag["href"].Value );
          }
        }
      }
    }


    /// <summary>
    /// Download the text found at the given HTTP(S) URL.
    /// </summary>
    /// <param name="url">The address to download.</param>
    /// <returns>The page text with every line terminated by "\r\n",
    /// or null when the URL is missing or malformed, is not an
    /// HTTP(S) URL, the download fails, or the response's content
    /// type does not start with "text/".</returns>
    public static string GetPage(string url)
    {
      if( string.IsNullOrEmpty(url) )
        return null;

      try
      {
        // "as" instead of a cast: WebRequest.Create returns other
        // request types for schemes such as file:, and those are
        // reported as "cannot process" rather than crashing.
        HttpWebRequest request =
                       WebRequest.Create(url) as HttpWebRequest;
        if( request==null )
          return null;

        using( WebResponse response = request.GetResponse() )
        using( Stream stream = response.GetResponseStream() )
        {
          string contentType = response.ContentType;
          if( contentType==null ||
              !contentType.StartsWith("text/",
                             StringComparison.OrdinalIgnoreCase) )
            return null;

          System.Text.StringBuilder buffer =
                             new System.Text.StringBuilder();

          using( StreamReader reader = new StreamReader(stream) )
          {
            string line;
            while( (line = reader.ReadLine())!=null )
            {
              buffer.Append(line).Append("\r\n");
            }
          }

          return buffer.ToString();
        }
      }
      catch(WebException e)
      {
        System.Console.WriteLine("Can't download:" + e);
        return null;
      }
      catch(IOException e)
      {
        System.Console.WriteLine("Can't download:" + e);
        return null;
      }
      catch(UriFormatException e)
      {
        // The text typed by the user is not a valid URI.
        System.Console.WriteLine("Can't download:" + e);
        return null;
      }
      catch(NotSupportedException e)
      {
        // The URI scheme has no registered request handler.
        System.Console.WriteLine("Can't download:" + e);
        return null;
      }
    }
  }
}

미로속에 갇힌 뇌

Parsing HTML in Microsoft C#

티스토리툴바