// Minimal usage example: scan an HTML page and print every href it contains.
// `page` is expected to hold the HTML text to scan; it is defined outside this snippet.
ParseHTML parse = new ParseHTML();
parse.Source = page;
// The end-of-input test is named Eof() on the parser class (not EOF()).
while (!parse.Eof())
{
    // Parse() returns (char)0 right after it has consumed a complete tag;
    // any other value is an ordinary character, which this example ignores.
    char ch = parse.Parse();
    if (ch == 0)
    {
        AttributeList tag = parse.GetTag();
        if (tag["href"] != null)
        {
            System.Console.WriteLine("Found link : " + tag["href"].Value);
        }
    }
}
ParseHTML 객체를 인스턴스화한 뒤, 파싱할 HTML 페이지(page)를 Source 프로퍼티에 지정한다.
루프는 페이지 끝에 도달할 때까지 반복되며, 태그가 아닌 일반 문자는 무시하고 넘어간다.
(Parse()가 0을 반환하면, 즉 ch == 0 이면 태그 하나를 읽었다는 뜻이다.)
각 태그에서 href attribute를 찾아, 있으면 해당 링크를 출력한다.
[ The Attribute Class ]
이 클래스는 HTML attribute 하나를 저장하는 데 사용한다.
예를 들어
<img src="picture.gif" alt="some Picture">
이 tag의 경우, attribute 는 src 와 alt 가 되고,
각 attribute 의 value 는 picture.gif 와 some Picture 이 된다.
attribute는 name, value, delim 세 가지 property를 가진다.
name은 attribute의 이름을, value는 그 값을 저장하고, delim은 value를 감싸는 구분 문자(" 또는 ')를 저장한다.
using System;
namespace HTML
{
/// <summary>
/// Represents one attribute of the kind stored in an HTML or XML
/// file: a name, a value, and the delimiter that enclosed the value.
/// This source code may be used freely under the
/// Limited GNU Public License(LGPL).
///
/// Written by Jeff Heaton (http://www.jeffheaton.com)
/// </summary>
public class Attribute : ICloneable
{
    /// <summary>
    /// Backing store for <see cref="Name"/>.
    /// </summary>
    private string m_attrName;

    /// <summary>
    /// Backing store for <see cref="Value"/>.
    /// </summary>
    private string m_attrValue;

    /// <summary>
    /// Backing store for <see cref="Delim"/> (i.e. " or ').
    /// </summary>
    private char m_attrDelim;

    /// <summary>
    /// Creates a blank attribute: empty name, empty value and
    /// no delimiter.
    /// </summary>
    public Attribute() : this("", "", (char)0)
    {
    }

    /// <summary>
    /// Creates an attribute whose value has no delimiter.
    /// </summary>
    /// <param name="name">The name of this attribute.</param>
    /// <param name="value">The value of this attribute.</param>
    public Attribute(String name, String value) : this(name, value, (char)0)
    {
    }

    /// <summary>
    /// Creates an attribute with every property given explicitly.
    /// </summary>
    /// <param name="name">The name of this attribute.</param>
    /// <param name="value">The value of this attribute.</param>
    /// <param name="delim">The delimiter character for the value.
    /// </param>
    public Attribute(string name, string value, char delim)
    {
        Name = name;
        Value = value;
        Delim = delim;
    }

    /// <summary>
    /// The name for this attribute.
    /// </summary>
    public string Name
    {
        get { return m_attrName; }
        set { m_attrName = value; }
    }

    /// <summary>
    /// The value for this attribute.
    /// </summary>
    public string Value
    {
        get { return m_attrValue; }
        set { m_attrValue = value; }
    }

    /// <summary>
    /// The delimiter for this attribute; (char)0 means none.
    /// </summary>
    public char Delim
    {
        get { return m_attrDelim; }
        set { m_attrDelim = value; }
    }

    #region ICloneable Members
    /// <summary>
    /// Produces a new Attribute carrying the same name, value and
    /// delimiter as this one.
    /// </summary>
    /// <returns>The newly created copy.</returns>
    public virtual object Clone()
    {
        return new Attribute(Name, Value, Delim);
    }
    #endregion
}
}
[ The AttributeList Class ]
HTML 태그는 보통 여러 개의 attribute로 이루어지는데,
이 클래스는 그런 attribute의 리스트를 저장한다.
AttributeList 클래스는 (태그의) name과 attribute들의 collection으로 이루어진다.
AttributeList에서는 숫자 인덱스 또는 이름으로 각각의 attribute에 접근할 수 있다.
위 예에서, attribute 인 src 같은 경우 AttributeList에서 접근을 하려면
theTag[0]
theTag["src"]
로 가능하다.
using System;
using System.Collections;
namespace HTML
{
/// <summary>
/// The AttributeList class stores an ordered list of Attribute
/// objects. Because it derives from Attribute it also carries its
/// own Name, Value and Delim.
/// This source code may be used freely under the
/// Limited GNU Public License(LGPL).
///
/// Written by Jeff Heaton (http://www.jeffheaton.com)
/// </summary>
public class AttributeList : Attribute
{
    /// <summary>
    /// The internal list that holds every attribute, in the order
    /// in which they were added.
    /// </summary>
    protected ArrayList m_list;

    /// <summary>
    /// Make a copy of this list. Every contained attribute is cloned,
    /// and the list's own Name, Value and Delim are copied as well so
    /// that the clone is not silently left blank.
    /// </summary>
    /// <returns>A new AttributeList that is a clone of this
    /// object.</returns>
    public override Object Clone()
    {
        AttributeList rtn = new AttributeList();
        rtn.Name = Name;
        rtn.Value = Value;
        rtn.Delim = Delim;
        for (int i = 0; i < m_list.Count; i++)
        {
            Attribute item = (Attribute)m_list[i];
            // Add() accepts null entries, so tolerate them here too.
            if (item == null)
            {
                rtn.Add(null);
            }
            else
            {
                rtn.Add((Attribute)item.Clone());
            }
        }
        return rtn;
    }

    /// <summary>
    /// Create a new, empty, attribute list.
    /// </summary>
    public AttributeList() : base("", "")
    {
        m_list = new ArrayList();
    }

    /// <summary>
    /// Add the specified attribute to the end of the list.
    /// </summary>
    /// <param name="a">An attribute to add to this
    /// AttributeList.</param>
    public void Add(Attribute a)
    {
        m_list.Add(a);
    }

    /// <summary>
    /// Remove all attributes from this AttributeList, returning
    /// it to an empty state.
    /// </summary>
    public void Clear()
    {
        m_list.Clear();
    }

    /// <summary>
    /// Returns true if this AttributeList contains no attributes.
    /// </summary>
    /// <returns>True if this AttributeList is empty, false
    /// otherwise.</returns>
    public bool IsEmpty()
    {
        return m_list.Count <= 0;
    }

    /// <summary>
    /// If there is already an attribute with the specified name,
    /// its value is changed to the specified value. Otherwise a new
    /// Attribute is created and appended. The name lookup is
    /// case-insensitive.
    /// </summary>
    /// <param name="name">The name of the Attribute to edit or
    /// create. Case-insensitive. A null name is ignored.</param>
    /// <param name="value">The value to be held in this
    /// attribute. Null is stored as the empty string.</param>
    public void Set(string name, string value)
    {
        if (name == null)
        {
            return;
        }
        if (value == null)
        {
            value = "";
        }
        Attribute a = this[name];
        if (a == null)
        {
            Add(new Attribute(name, value));
        }
        else
        {
            a.Value = value;
        }
    }

    /// <summary>
    /// How many attributes are in this AttributeList?
    /// </summary>
    public int Count
    {
        get
        {
            return m_list.Count;
        }
    }

    /// <summary>
    /// The underlying list of attributes. This is the live internal
    /// list, not a copy.
    /// </summary>
    public ArrayList List
    {
        get
        {
            return m_list;
        }
    }

    /// <summary>
    /// Access an individual attribute by position.
    /// </summary>
    /// <param name="index">Zero-based position.</param>
    /// <returns>The attribute at that position, or null when the
    /// index is negative or past the end of the list.</returns>
    public Attribute this[int index]
    {
        get
        {
            // The lower bound is checked as well so that a negative
            // index yields null instead of an exception.
            if (index >= 0 && index < m_list.Count)
            {
                return (Attribute)m_list[index];
            }
            return null;
        }
    }

    /// <summary>
    /// Access an individual attribute by name (case-insensitive).
    /// </summary>
    /// <param name="index">The attribute name to look for.</param>
    /// <returns>The first attribute whose name matches, or null when
    /// there is none or when the requested name is null.</returns>
    public Attribute this[string index]
    {
        get
        {
            if (index == null)
            {
                return null;
            }
            for (int i = 0; i < m_list.Count; i++)
            {
                Attribute candidate = (Attribute)m_list[i];
                // Ordinal comparison keeps the match independent of the
                // current culture (ToLower() is not, e.g. for 'I' in Turkish)
                // and copes with attributes whose Name is null.
                if (candidate != null &&
                    string.Equals(candidate.Name, index, StringComparison.OrdinalIgnoreCase))
                {
                    return candidate;
                }
            }
            return null;
        }
    }
}
}
[ The Parse Class ]
단순히 HTML을 파싱하는 것이 목적이라면 이 Parse 클래스를 직접 사용할 필요는 없다.
Parse 클래스는 HTML, SGML, XML이나 HTTP header처럼 attribute-value 쌍을 기반으로 하는 텍스트를 low-level에서 처리하기 위해 HTML parser 내부에서 사용되는 클래스이기 때문이다.
using System;
namespace HTML
{
/// <summary>
/// Base class for parsing tag based files, such as HTML,
/// HTTP headers, or XML. It keeps a cursor into the source text and
/// offers helpers to read attribute names and values at that cursor.
///
/// This source code may be used freely under the
/// Limited GNU Public License(LGPL).
///
/// Written by Jeff Heaton (http://www.jeffheaton.com)
/// </summary>
public class Parse : AttributeList
{
    /// <summary>
    /// The source text that is being parsed.
    /// </summary>
    private string m_source;

    /// <summary>
    /// The current position inside of the text that
    /// is being parsed.
    /// </summary>
    private int m_idx;

    /// <summary>
    /// The most recently parsed attribute delimiter.
    /// </summary>
    private char m_parseDelim;

    /// <summary>
    /// The most recently parsed attribute name.
    /// </summary>
    private string m_parseName;

    /// <summary>
    /// The most recently parsed attribute value.
    /// </summary>
    private string m_parseValue;

    /// <summary>
    /// The most recently parsed tag.
    /// </summary>
    public string m_tag;

    /// <summary>
    /// Determine if the specified character is whitespace
    /// (tab, line feed, carriage return or space).
    /// </summary>
    /// <param name="ch">A character to check</param>
    /// <returns>true if the character is whitespace</returns>
    public static bool IsWhiteSpace(char ch)
    {
        return "\t\n\r ".IndexOf(ch) != -1;
    }

    /// <summary>
    /// Advance the index until past any whitespace.
    /// </summary>
    public void EatWhiteSpace()
    {
        while (!Eof() && IsWhiteSpace(GetCurrentChar()))
        {
            m_idx++;
        }
    }

    /// <summary>
    /// Determine if the end of the source text has been reached.
    /// </summary>
    /// <returns>True if the end of the source text has been
    /// reached, or if no source text has been assigned.</returns>
    public bool Eof()
    {
        // A parser without a source has nothing left to read.
        return m_source == null || m_idx >= m_source.Length;
    }

    /// <summary>
    /// Parse the attribute name. Characters are appended to
    /// ParseName until whitespace, '=' or '&gt;' is reached;
    /// surrounding whitespace is skipped.
    /// </summary>
    public void ParseAttributeName()
    {
        EatWhiteSpace();
        // get attribute name
        while (!Eof())
        {
            char current = GetCurrentChar();
            if (IsWhiteSpace(current) || current == '=' || current == '>')
            {
                break;
            }
            m_parseName += current;
            m_idx++;
        }
        EatWhiteSpace();
    }

    /// <summary>
    /// Parse the attribute value. Nothing happens unless the current
    /// character is '=' and no delimiter has been recorded yet.
    /// A value may be quoted with ' or " (the quote is stored in
    /// ParseDelim) or unquoted (it then ends at whitespace or '&gt;').
    /// </summary>
    public void ParseAttributeValue()
    {
        if (m_parseDelim != 0)
        {
            return;
        }
        if (GetCurrentChar() != '=')
        {
            return;
        }

        m_idx++;
        EatWhiteSpace();
        char opener = GetCurrentChar();
        if (opener == '\'' || opener == '\"')
        {
            m_parseDelim = opener;
            m_idx++;
            // Stop at end of input as well: GetCurrentChar() yields (char)0
            // there, which never equals the quote, so an unterminated value
            // would otherwise loop forever.
            while (!Eof() && GetCurrentChar() != m_parseDelim)
            {
                m_parseValue += GetCurrentChar();
                m_idx++;
            }
            // Skip the closing quote when there is one.
            if (!Eof())
            {
                m_idx++;
            }
        }
        else
        {
            while (!Eof() &&
                   !IsWhiteSpace(GetCurrentChar()) &&
                   GetCurrentChar() != '>')
            {
                m_parseValue += GetCurrentChar();
                m_idx++;
            }
        }
        EatWhiteSpace();
    }

    /// <summary>
    /// Add a new attribute built from ParseName, ParseValue and
    /// ParseDelim to the collection.
    /// </summary>
    public void AddAttribute()
    {
        Attribute a = new Attribute(m_parseName,
            m_parseValue, m_parseDelim);
        Add(a);
    }

    /// <summary>
    /// Get the current character that is being parsed.
    /// </summary>
    /// <returns>The character at the current position, or (char)0
    /// when the position is past the end of the source.</returns>
    public char GetCurrentChar()
    {
        return GetCurrentChar(0);
    }

    /// <summary>
    /// Get a few characters ahead of the current character.
    /// </summary>
    /// <param name="peek">How many characters to peek ahead
    /// for.</param>
    /// <returns>The character that was retrieved, or (char)0 when the
    /// requested position lies outside the source text.</returns>
    public char GetCurrentChar(int peek)
    {
        int position = m_idx + peek;
        if (m_source != null && position >= 0 && position < m_source.Length)
        {
            return m_source[position];
        }
        return (char)0;
    }

    /// <summary>
    /// Obtain the character at the current position and advance the
    /// index by one. The source is indexed directly, so this must
    /// only be called while Eof() is false.
    /// </summary>
    /// <returns>The character that was at the current position.</returns>
    public char AdvanceCurrentChar()
    {
        return m_source[m_idx++];
    }

    /// <summary>
    /// Move the index forward by one.
    /// </summary>
    public void Advance()
    {
        m_idx++;
    }

    /// <summary>
    /// The last attribute name that was encountered.
    /// </summary>
    public string ParseName
    {
        get
        {
            return m_parseName;
        }
        set
        {
            m_parseName = value;
        }
    }

    /// <summary>
    /// The last attribute value that was encountered.
    /// </summary>
    public string ParseValue
    {
        get
        {
            return m_parseValue;
        }
        set
        {
            m_parseValue = value;
        }
    }

    /// <summary>
    /// The last attribute delimiter that was encountered.
    /// </summary>
    public char ParseDelim
    {
        get
        {
            return m_parseDelim;
        }
        set
        {
            m_parseDelim = value;
        }
    }

    /// <summary>
    /// The text that is to be parsed. Assigning it does not change
    /// the current position.
    /// </summary>
    public string Source
    {
        get
        {
            return m_source;
        }
        set
        {
            m_source = value;
        }
    }
}
}
[ The ParseHTML Class ]
이 ParseHTML 클래스는 Parse 클래스의 subclass 이다. 이 ParseHTML 클래스는 HTML 파싱을 하기 위한 특별한 코드들을 제공한다.
이 ParseHTML 클래스는 HTML parser 의 주 인터페이스가 될것이다.
주로 사용될 메소드는
public char Parse()
public AttributeList GetTag()
Parse 메소드는 HTML에서 다음 문자를 읽어 반환한다.
만약 다음 문자가 tag의 시작이라면 tag 전체를 파싱한 뒤 0을 반환한다.
즉 Parse()가 0을 반환하면 HTML tag를 처리해야 한다.
tag에 접근하려면 GetTag 메소드를 사용하면 된다.
GetTag() 메소드는 AttributeList 객체를 반환하는데,
이 객체에는 tag 이름과 attribute가 전부 들어 있다.
using System;
namespace HTML
{
/// <summary>
/// HTML-specific parser built on top of Parse. Call Parse()
/// repeatedly to walk through the source; whenever it returns
/// (char)0 a tag has just been read and can be fetched with GetTag().
/// </summary>
public class ParseHTML : Parse
{
    /// <summary>
    /// Get a copy of the most recently parsed tag.
    /// </summary>
    /// <returns>A new AttributeList whose Name is the tag name and
    /// whose entries are clones of the parsed attributes.</returns>
    public AttributeList GetTag()
    {
        AttributeList tag = new AttributeList();
        tag.Name = m_tag;
        foreach (Attribute x in List)
        {
            tag.Add((Attribute)x.Clone());
        }
        return tag;
    }

    /// <summary>
    /// Rebuild the text of the current tag from the tag name and the
    /// attributes held by this parser.
    /// </summary>
    /// <returns>The tag as a string, e.g. &lt;a href="x"&gt;.</returns>
    public String BuildTag()
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder("<");
        buffer.Append(m_tag);
        int i = 0;
        while (this[i] != null)
        {
            Attribute a = this[i];
            buffer.Append(' ');
            if (a.Value == null)
            {
                // Name-only attribute; the name itself may have been quoted.
                if (a.Delim != 0)
                {
                    buffer.Append(a.Delim);
                }
                buffer.Append(a.Name);
                if (a.Delim != 0)
                {
                    buffer.Append(a.Delim);
                }
            }
            else if (a.Value.Length == 0 && a.Delim == 0)
            {
                // ParseTag stores "" (not null) for attributes written without
                // a value, such as "disabled" or the "/" of "<br />"; emit just
                // the name instead of a dangling "name=".
                buffer.Append(a.Name);
            }
            else
            {
                buffer.Append(a.Name);
                buffer.Append('=');
                if (a.Delim != 0)
                {
                    buffer.Append(a.Delim);
                }
                buffer.Append(a.Value);
                if (a.Delim != 0)
                {
                    buffer.Append(a.Delim);
                }
            }
            i++;
        }
        buffer.Append('>');
        return buffer.ToString();
    }

    /// <summary>
    /// Parse one tag. The current position must be just after the
    /// opening '&lt;'. The tag name is stored in m_tag and the
    /// attributes replace the current attribute list. A comment is
    /// stored entirely in m_tag (carriage returns are dropped) and
    /// produces no attributes.
    /// </summary>
    protected void ParseTag()
    {
        m_tag = "";
        Clear();

        // Is it a comment?
        if ((GetCurrentChar() == '!') &&
            (GetCurrentChar(1) == '-') &&
            (GetCurrentChar(2) == '-'))
        {
            while (!Eof())
            {
                if ((GetCurrentChar() == '-') &&
                    (GetCurrentChar(1) == '-') &&
                    (GetCurrentChar(2) == '>'))
                {
                    break;
                }
                if (GetCurrentChar() != '\r')
                {
                    m_tag += GetCurrentChar();
                }
                Advance();
            }
            m_tag += "--";
            // Step over the closing "-->".
            Advance();
            Advance();
            Advance();
            ParseDelim = (char)0;
            return;
        }

        // Find the tag name
        while (!Eof())
        {
            if (IsWhiteSpace(GetCurrentChar()) ||
                (GetCurrentChar() == '>'))
            {
                break;
            }
            m_tag += GetCurrentChar();
            Advance();
        }
        EatWhiteSpace();

        // Get the attributes. The Eof() test matters: past the end
        // GetCurrentChar() returns (char)0, never '>', so a tag that is not
        // closed before the end of input would otherwise add empty
        // attributes forever.
        while (!Eof() && GetCurrentChar() != '>')
        {
            ParseName = "";
            ParseValue = "";
            ParseDelim = (char)0;
            ParseAttributeName();
            if (GetCurrentChar() == '>')
            {
                AddAttribute();
                break;
            }
            // Get the value(if any)
            ParseAttributeValue();
            AddAttribute();
        }
        // Step over the closing '>'.
        Advance();
    }

    /// <summary>
    /// Read the next item from the source. Call only while Eof()
    /// is false.
    /// </summary>
    /// <returns>(char)0 when a tag was parsed (retrieve it with
    /// GetTag()); otherwise the next character of ordinary text.
    /// A '&lt;' that does not start a tag is returned as the
    /// literal character '&lt;'.</returns>
    public char Parse()
    {
        if (GetCurrentChar() != '<')
        {
            return AdvanceCurrentChar();
        }

        Advance();
        char next = GetCurrentChar();
        // Explicit ASCII ranges: a culture-sensitive upper-casing would
        // fail to recognise e.g. "<img" under the Turkish culture.
        bool startsTag = (next >= 'A' && next <= 'Z') ||
                         (next >= 'a' && next <= 'z') ||
                         next == '!' || next == '/';
        if (startsTag)
        {
            ParseTag();
            return (char)0;
        }

        // Not a tag: hand back the '<' itself and leave the following
        // character for the next call. This keeps the '<' in the text and
        // avoids reading past the end when the source ends with '<'.
        return '<';
    }
}
}
[ The FindLinks Class ]
using System;
using System.Net;
using System.IO;
namespace HTML
{
/// <summary>
/// FindLinks is a class that will test the HTML parser.
/// This short example will prompt for a URL and then
/// scan that URL for links.
/// This source code may be used freely under the
/// Limited GNU Public License(LGPL).
///
/// Written by Jeff Heaton (http://www.jeffheaton.com)
/// </summary>
class FindLinks
{
    /// <summary>
    /// The main entry point for the application. Prompts for a URL,
    /// downloads it and prints the href of every tag that has one.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    [STAThread]
    static void Main(string[] args)
    {
        System.Console.Write("Enter a URL address:");
        string url = System.Console.ReadLine();
        System.Console.WriteLine("Scanning hyperlinks at: " + url);

        string page = GetPage(url);
        if (page == null)
        {
            System.Console.WriteLine(
                "Can't process that type of file,please specify an HTML file URL.");
            return;
        }

        ParseHTML parse = new ParseHTML();
        parse.Source = page;
        while (!parse.Eof())
        {
            // Parse() returns (char)0 right after it has read a tag.
            char ch = parse.Parse();
            if (ch == 0)
            {
                AttributeList tag = parse.GetTag();
                if (tag["href"] != null)
                {
                    System.Console.WriteLine("Found link: " +
                        tag["href"].Value);
                }
            }
        }
    }

    /// <summary>
    /// Download the text found at the given URL.
    /// </summary>
    /// <param name="url">The HTTP or HTTPS address to download.</param>
    /// <returns>The page text with every line terminated by "\r\n",
    /// or null when the URL is missing, malformed or not an HTTP(S)
    /// URL, when the download fails, or when the content type does
    /// not start with "text/".</returns>
    public static string GetPage(string url)
    {
        // ReadLine() yields null when standard input is closed.
        if (string.IsNullOrEmpty(url))
        {
            System.Console.WriteLine("Can't download: no URL was given.");
            return null;
        }

        try
        {
            // "as" instead of a cast: a non-HTTP scheme (file:, ftp:) gives
            // null here rather than an InvalidCastException.
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                System.Console.WriteLine(
                    "Can't download: only HTTP and HTTPS URLs are supported.");
                return null;
            }

            // The using blocks close reader, stream and response on every
            // path, including early returns and exceptions.
            using (WebResponse response = request.GetResponse())
            {
                string contentType = response.ContentType;
                if (contentType == null ||
                    !contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
                {
                    return null;
                }

                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream))
                {
                    // StringBuilder avoids re-copying the whole page for
                    // every line that is appended.
                    System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        buffer.Append(line).Append("\r\n");
                    }
                    return buffer.ToString();
                }
            }
        }
        catch (WebException e)
        {
            System.Console.WriteLine("Can't download:" + e);
            return null;
        }
        catch (IOException e)
        {
            System.Console.WriteLine("Can't download:" + e);
            return null;
        }
        catch (UriFormatException e)
        {
            // The text typed by the user is not a valid URI.
            System.Console.WriteLine("Can't download:" + e);
            return null;
        }
        catch (NotSupportedException e)
        {
            // The URI scheme has no registered request handler.
            System.Console.WriteLine("Can't download:" + e);
            return null;
        }
    }
}
}