(C#닷넷동영상)C# HtmlAgilityPack을 이용한 웹크롤러(Web-Crawler) 웹페이지, HTML 파싱하기, 닷넷학원, C#학원, WPF학원,자바학원, JAVA동영상
http://ojc.asia/bbs/board.php?bo_table=LecCsharpNet&wr_id=57
ojc.asia
https://www.youtube.com/watch?v=wNo2hF0uY4U&list=PLxU-iZCqT52CA9Y474h7UbqmWqXwIZ-hl&index=6

https://www.youtube.com/watch?v=ybhnGNhhLXU&list=PLxU-iZCqT52CA9Y474h7UbqmWqXwIZ-hl&index=5


HtmlAgilityPack을 이용한 웹크롤러
웹페이지, HTML 파싱하기
HtmlAgility, WebClient를 이용한 웹크롤러 만들기(Web-Crawler)
- HtmlAgilityPack은 .NET Framework의 코드만으로 HTML 문서를 파싱하고 분석할 수 있는 도구로, 닷넷 프로젝트에서는 NuGet 패키지 관리에서 간단히 설치해서 사용하면 된다.
- System.Xml 네임스페이스에서 제공하는 XPath를 지원하여 HTML 문서 탐색을 편하게 해 주며, HTML을 다운로드하거나 HTML을 파싱하는 방법 등을 이용할 수 있다.
- HTML 노드를 탐색하기 위해서는 XPath 문법을 알아야 한다. 단일 노드(SelectSingleNode) 및 여러 노드(SelectNodes) 탐색이 가능하며, 여러 노드(SelectNodes)를 탐색한 경우 foreach 반복문에서 나머지 처리를 하면 된다. 일치하는 노드가 없으면 두 메서드 모두 null을 반환하므로 사용 전에 null 검사가 필요하다.
// Minimal crawl sample: download a page, list every <a href>, then read the <title>.
string htmlSource;
using (WebClient client = new WebClient())
{
    htmlSource = client.DownloadString("http://naver.com");
}

HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(htmlSource);

// Walk the <a> tags under <body> and read each href attribute.
// SelectNodes returns null (not an empty collection) when nothing matches, so guard before iterating.
HtmlNodeCollection aNodes = doc.DocumentNode.SelectNodes("//body//a");
if (aNodes != null)
{
    foreach (HtmlNode aNode in aNodes)
    {
        string hrefValue = aNode.Attributes["href"]?.Value;
        Console.WriteLine("a href: " + hrefValue);
    }
}

// Read the title of the web document; SelectSingleNode is null when the page has no <title>.
title.Text = doc.DocumentNode.SelectSingleNode("//head/title")?.InnerText;
- 윈폼에서 URL을 입력받아서 전체 html 소스를 받고 html 소스중 <title> 태그를 파싱하여 타이틀을 출력하고 <a href >, <img src> 태그를 파싱하여 출력하는 예제를 만들어 보자.
- 실행화면


- 1. C#, 윈폼으로 WinFormsApp4라는 이름의 프로젝트를 생성한다.
- 2. 프로젝트 우측 마우스 클릭 >> NuGet 패키지 관리에서 "HtmlAgility"로 검색하여 HtmlAgilityPack을 설치하자.
- 3. Form1.Designer.cs
namespace WinFormsApp4
{
partial class Form1
{
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.IContainer components = null;
/// <summary>
/// Releases the resources held by this form.
/// </summary>
/// <param name="disposing">
/// true to also dispose the managed <c>components</c> container; false to skip it.
/// </param>
protected override void Dispose(bool disposing)
{
    if (disposing)
    {
        // The container may never have been created, hence the null-conditional call.
        components?.Dispose();
    }

    base.Dispose(disposing);
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// Creates the ten controls of the form (one button, five text boxes, four labels),
/// sets their position, size, tab order and caption, and adds them to the form.
/// </summary>
private void InitializeComponent()
{
// Instantiate every control before any of them is configured.
this.button1 = new System.Windows.Forms.Button();
this.txtURL = new System.Windows.Forms.TextBox();
this.label1 = new System.Windows.Forms.Label();
this.label2 = new System.Windows.Forms.Label();
this.txtResult = new System.Windows.Forms.TextBox();
this.txtHyperLink = new System.Windows.Forms.TextBox();
this.txtTitle = new System.Windows.Forms.TextBox();
this.label3 = new System.Windows.Forms.Label();
this.txtImg = new System.Windows.Forms.TextBox();
this.label4 = new System.Windows.Forms.Label();
// Layout is suspended here and resumed at the end of the method.
this.SuspendLayout();
//
// button1 - button captioned "웹크롤링" (web crawling); its Click event is wired to button1_Click
//
this.button1.Location = new System.Drawing.Point(42, 37);
this.button1.Name = "button1";
this.button1.Size = new System.Drawing.Size(210, 33);
this.button1.TabIndex = 0;
this.button1.Text = "웹크롤링";
this.button1.UseVisualStyleBackColor = true;
this.button1.Click += new System.EventHandler(this.button1_Click);
//
// txtURL - single-line text box to the right of the button; uses the larger 14.25pt font
//
this.txtURL.Font = new System.Drawing.Font("맑은 고딕", 14.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point);
this.txtURL.Location = new System.Drawing.Point(287, 35);
this.txtURL.Name = "txtURL";
this.txtURL.Size = new System.Drawing.Size(473, 33);
this.txtURL.TabIndex = 1;
//
// label1 - caption "전체 HTML 소스" (full HTML source), placed just above txtResult
//
this.label1.AutoSize = true;
this.label1.Location = new System.Drawing.Point(49, 93);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(95, 15);
this.label1.TabIndex = 2;
this.label1.Text = "전체 HTML 소스";
//
// label2 - caption "하이퍼링크" (hyperlinks), placed just above txtHyperLink
//
this.label2.AutoSize = true;
this.label2.Location = new System.Drawing.Point(42, 267);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(67, 15);
this.label2.TabIndex = 3;
this.label2.Text = "하이퍼링크";
//
// txtResult - multi-line text box with a vertical scroll bar
//
this.txtResult.Location = new System.Drawing.Point(42, 111);
this.txtResult.Multiline = true;
this.txtResult.Name = "txtResult";
this.txtResult.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;
this.txtResult.Size = new System.Drawing.Size(718, 93);
this.txtResult.TabIndex = 4;
//
// txtHyperLink - multi-line text box with a vertical scroll bar
//
this.txtHyperLink.Location = new System.Drawing.Point(42, 285);
this.txtHyperLink.Multiline = true;
this.txtHyperLink.Name = "txtHyperLink";
this.txtHyperLink.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;
this.txtHyperLink.Size = new System.Drawing.Size(718, 95);
this.txtHyperLink.TabIndex = 5;
//
// txtTitle - single-line text box to the right of label3; uses the larger 14.25pt font
//
this.txtTitle.Font = new System.Drawing.Font("맑은 고딕", 14.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point);
this.txtTitle.Location = new System.Drawing.Point(91, 215);
this.txtTitle.Name = "txtTitle";
this.txtTitle.Size = new System.Drawing.Size(669, 33);
this.txtTitle.TabIndex = 6;
//
// label3 - caption "타이틀" (title), placed to the left of txtTitle
//
this.label3.AutoSize = true;
this.label3.Location = new System.Drawing.Point(42, 226);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(43, 15);
this.label3.TabIndex = 7;
this.label3.Text = "타이틀";
//
// txtImg - multi-line text box with a vertical scroll bar
//
this.txtImg.Location = new System.Drawing.Point(42, 417);
this.txtImg.Multiline = true;
this.txtImg.Name = "txtImg";
this.txtImg.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;
this.txtImg.Size = new System.Drawing.Size(718, 91);
this.txtImg.TabIndex = 9;
//
// label4 - caption "이미지" (images), placed just above txtImg
//
this.label4.AutoSize = true;
this.label4.Location = new System.Drawing.Point(42, 399);
this.label4.Name = "label4";
this.label4.Size = new System.Drawing.Size(43, 15);
this.label4.TabIndex = 8;
this.label4.Text = "이미지";
//
// Form1 - the form itself: font-based auto scaling, a 799x535 client area, and all ten controls as children
//
this.AutoScaleDimensions = new System.Drawing.SizeF(7F, 15F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(799, 535);
this.Controls.Add(this.txtImg);
this.Controls.Add(this.label4);
this.Controls.Add(this.label3);
this.Controls.Add(this.txtTitle);
this.Controls.Add(this.txtHyperLink);
this.Controls.Add(this.txtResult);
this.Controls.Add(this.label2);
this.Controls.Add(this.label1);
this.Controls.Add(this.txtURL);
this.Controls.Add(this.button1);
this.Name = "Form1";
this.Text = "Form1";
// Resume the layout suspended above and lay the controls out.
this.ResumeLayout(false);
this.PerformLayout();
}
#endregion
// Control fields created and configured in InitializeComponent.
// Button captioned "웹크롤링" (web crawling); its Click event runs button1_Click.
private Button button1;
// Text box whose text button1_Click appends to "http://" to form the download address.
private TextBox txtURL;
// Label captioned "전체 HTML 소스" (full HTML source).
private Label label1;
// Label captioned "하이퍼링크" (hyperlinks).
private Label label2;
// Multi-line text box that button1_Click fills with the downloaded HTML.
private TextBox txtResult;
// Multi-line text box passed to HTMLParser.ParseHTML to receive the href values.
private TextBox txtHyperLink;
// Text box passed to HTMLParser.ParseHTML to receive the page title.
private TextBox txtTitle;
// Label captioned "타이틀" (title).
private Label label3;
// Multi-line text box passed to HTMLParser.ParseHTML to receive the img src values.
private TextBox txtImg;
// Label captioned "이미지" (images).
private Label label4;
}
}
- 4. Form1.cs
using HtmlAgilityPack;
using System.Net;
using System.Text;
namespace WinFormsApp4
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form and builds its controls through the designer-generated InitializeComponent.
/// </summary>
public Form1() => InitializeComponent();
/// <summary>
/// Handles the crawl button: downloads the page named in txtURL, shows the raw HTML in
/// txtResult and passes it to HTMLParser.ParseHTML to fill the title, hyperlink and image boxes.
/// Empty or malformed addresses and download failures are reported in a message box
/// instead of crashing the application.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Start from a clean slate so a failed or repeated crawl never shows stale results.
    txtResult.Clear();
    txtTitle.Clear();
    txtHyperLink.Clear();
    txtImg.Clear();

    string address = txtURL.Text.Trim();
    if (address.Length == 0)
    {
        MessageBox.Show("URL을 입력하세요.");
        return;
    }

    // A bare host such as "naver.com" still gets "http://" prepended as before,
    // but an address that already carries an http/https scheme is used unchanged.
    bool hasScheme = address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        address = "http://" + address;
    }

    // Validate up front so a malformed address is rejected here rather than deep inside WebClient.
    if (!Uri.TryCreate(address, UriKind.Absolute, out var target))
    {
        MessageBox.Show("올바른 URL이 아닙니다: " + address);
        return;
    }

    string htmlSource;
    try
    {
        // WebClient implements IDisposable; it is kept (rather than HttpClient) so the
        // file's dependencies stay unchanged.
        using (WebClient client = new WebClient())
        {
            client.Encoding = Encoding.UTF8;
            htmlSource = client.DownloadString(target);
        }
    }
    catch (WebException ex)
    {
        // DNS failures, refused connections and HTTP error statuses all surface as WebException.
        MessageBox.Show("페이지를 내려받지 못했습니다: " + ex.Message);
        return;
    }

    txtResult.Text = htmlSource;
    Console.WriteLine(htmlSource);
    // The former Console.ReadLine() call is gone: in a GUI event handler it blocks
    // the UI thread whenever a console is attached and serves no purpose otherwise.

    HTMLParser parser = new HTMLParser();
    parser.ParseHTML(htmlSource, this.txtHyperLink, this.txtTitle, this.txtImg);
}
/// <summary>
/// Extracts the page title, hyperlink targets and image sources from an HTML string
/// and writes them into the supplied text boxes.
/// </summary>
public class HTMLParser
{
    /// <summary>
    /// Parses <paramref name="htmlSource"/> and fills the three text boxes.
    /// Each box is overwritten (not appended to), so repeated calls do not accumulate results.
    /// A page that lacks a title, links or images leaves the corresponding box empty.
    /// </summary>
    /// <param name="htmlSource">Raw HTML text; null is treated as an empty document.</param>
    /// <param name="hyperLink">Receives one href value per line, for every a element that has an href.</param>
    /// <param name="title">Receives the entity-decoded, trimmed text of //head/title.</param>
    /// <param name="img">Receives one src value per line, for every img element that has a src.</param>
    public void ParseHTML(string htmlSource, TextBox hyperLink, TextBox title, TextBox img)
    {
        // Fully qualified because System.Windows.Forms also declares an HtmlDocument type.
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(htmlSource ?? string.Empty);

        // SelectSingleNode returns null when the page has no <title>.
        HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//head/title");
        title.Text = titleNode == null
            ? string.Empty
            : HtmlEntity.DeEntitize(titleNode.InnerText).Trim();

        // The original looped over every //body and ran the absolute XPath "//a" inside it,
        // which searched the whole document each time. A single document-wide query returns
        // the same nodes once, and also works for pages without an explicit <body>.
        hyperLink.Text = CollectAttributeValues(doc, "//a[@href]", "href", "a : ");
        img.Text = CollectAttributeValues(doc, "//img[@src]", "src", "src : ");
    }

    // Returns the named attribute of every node matching xpath, one value per line,
    // echoing each value to the console with the given prefix.
    private static string CollectAttributeValues(
        HtmlAgilityPack.HtmlDocument doc, string xpath, string attributeName, string logPrefix)
    {
        // SelectNodes returns null, not an empty collection, when nothing matches.
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
        if (nodes == null)
        {
            return string.Empty;
        }

        // StringBuilder avoids re-assigning TextBox.Text once per node.
        StringBuilder lines = new StringBuilder();
        foreach (HtmlNode node in nodes)
        {
            string value = node.Attributes[attributeName]?.Value;
            if (string.IsNullOrEmpty(value))
            {
                continue;
            }

            // A multi-line WinForms TextBox only breaks lines on CR+LF, so a bare "\n" is not enough.
            lines.Append(value).Append(Environment.NewLine);
            Console.WriteLine(logPrefix + value);
        }

        return lines.ToString();
    }
}
}
}
#HtmlAgilityPack, #HTMLAgility, #웹크롤러, #WebCrawler, #웹크롤링, #웹페이지파싱, #시샵동영상, #시샵교육, #닷넷교육, #닷넷학원, #시샵학원, #닷넷동영상, HtmlAgilityPack, HTMLAgility, 웹크롤러, WebCrawler, 웹크롤링, 웹페이지파싱, 시샵동영상, 시샵교육, 닷넷교육, 닷넷학원, 시샵학원, 닷넷동영상
댓글 없음:
댓글 쓰기