레이블이 HTMLAgility인 게시물을 표시합니다. 모든 게시물 표시
레이블이 HTMLAgility인 게시물을 표시합니다. 모든 게시물 표시

2022년 2월 12일 토요일

(C#닷넷동영상)C# HtmlAgilityPack을 이용한 웹크롤러(Web-Crawler) 웹페이지, HTML 파싱하기, 닷넷학원, C#학원, WPF학원,자바학원, JAVA동영상

 (C#닷넷동영상)C# HtmlAgilityPack을 이용한 웹크롤러(Web-Crawler) 웹페이지, HTML 파싱하기, 닷넷학원, C#학원, WPF학원,자바학원, JAVA동영상


http://ojc.asia/bbs/board.php?bo_table=LecCsharpNet&wr_id=57 


(동영상)C# HtmlAgilityPack을 이용한 웹크롤러(Web-Crawler) 웹페이지, HTML 파싱하기

(동영상)C# HtmlAgilityPack을 이용한 웹크롤러(Web-Crawler) 웹페이지, HTML 파싱하기HtmlAgilityPack을 이용한 웹크롤러웹페이지, HTML 파싱하기HtmlAgility, WebClient를 이용한 웹크롤러 만들기(Web-Crawler)HtmlAgilityPa

ojc.asia

https://www.youtube.com/watch?v=wNo2hF0uY4U&list=PLxU-iZCqT52CA9Y474h7UbqmWqXwIZ-hl&index=6 

https://www.youtube.com/watch?v=ybhnGNhhLXU&list=PLxU-iZCqT52CA9Y474h7UbqmWqXwIZ-hl&index=5 



HtmlAgilityPack을 이용한 웹크롤러

웹페이지, HTML 파싱하기







HtmlAgility, WebClient를 이용한 웹크롤러 만들기(Web-Crawler)


  • HtmlAgilityPack은 .NET Framework의 코드만으로 HTML 문서를 파싱하고 분석할 수 있는 도구로, 닷넷 프로젝트에서 사용하려면 NuGet 패키지 관리에서 간단히 설치해서 사용하면 된다. 


  • System.Xml 네임스페이스에서 제공하는 XPath를 지원하고 HTML 문서 탐색을 편하게 해주며, HTML을 다운받거나 HTML을 파싱하는 방법 등을 이용할 수 있다.


  • HTML 노드를 탐색하기 위해서는 XPath 문법을 알아야 한다. 단일 노드(SelectSingleNode) 및 여러 노드(SelectNodes) 탐색이 가능하며, 여러 노드(SelectNodes)를 탐색한 경우 foreach 반복문에서 나머지 처리를 하면 된다.


// Download the page source as a string; WebClient is IDisposable, so release it when done.
string htmlSource;
using (WebClient client = new WebClient())
{
    htmlSource = client.DownloadString("http://naver.com");
}

HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(htmlSource);

// Walk every <a> tag under <body> and read its href attribute.
// SelectSingleNode/SelectNodes return null when nothing matches, so guard before iterating.
// The leading "." makes the XPath relative to bodyNode; a bare "//a" would search the whole document.
HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode("//body");
HtmlNodeCollection anchorNodes = bodyNode?.SelectNodes(".//a");
if (anchorNodes != null)
{
    foreach (HtmlNode aNode in anchorNodes)
    {
        // Not every <a> element carries an href attribute.
        var hrefValue = aNode.Attributes["href"]?.Value;
        if (hrefValue != null)
        {
            Console.WriteLine("a href: " + hrefValue);
        }
    }
}

// Read the document title (empty when the page has no <title> element).
title.Text = doc.DocumentNode.SelectSingleNode("//head/title")?.InnerText ?? string.Empty;


  • 윈폼에서 URL을 입력받아서 전체 html 소스를 받고 html 소스중 <title> 태그를 파싱하여 타이틀을 출력하고 <a href >, <img src> 태그를 파싱하여 출력하는 예제를 만들어 보자.


  • 실행화면




  • 1. C#, 윈폼으로 WinFormsApp4라는 이름의 프로젝트를 생성한다.
  • 2. 프로젝트 우측 마우스 클릭 >> NuGet 패키지 관리에서 "HtmlAgility"로 검색하여 HtmlAgilityPack을 설치하자.
  • 3. Form1.Designer.cs


namespace WinFormsApp4

{

    /// <summary>
    ///  Designer half of the partial class <c>Form1</c>: declares the form's controls,
    ///  builds and lays them out in <see cref="InitializeComponent"/>, and disposes the
    ///  designer component container in <see cref="Dispose(bool)"/>.
    /// </summary>
    partial class Form1

    {

        /// <summary>

        ///  Required designer variable.

        /// </summary>

        private System.ComponentModel.IContainer components = null;


        /// <summary>

        ///  Clean up any resources being used.

        /// </summary>

        /// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>

        protected override void Dispose(bool disposing)

        {

            // components stays null unless something assigns it, hence the null check.
            if (disposing && (components != null))

            {

                components.Dispose();

            }

            base.Dispose(disposing);

        }


        #region Windows Form Designer generated code


        /// <summary>

        ///  Required method for Designer support - do not modify

        ///  the contents of this method with the code editor.

        /// </summary>

        private void InitializeComponent()

        {

            this.button1 = new System.Windows.Forms.Button();

            this.txtURL = new System.Windows.Forms.TextBox();

            this.label1 = new System.Windows.Forms.Label();

            this.label2 = new System.Windows.Forms.Label();

            this.txtResult = new System.Windows.Forms.TextBox();

            this.txtHyperLink = new System.Windows.Forms.TextBox();


            this.txtTitle = new System.Windows.Forms.TextBox();

            this.label3 = new System.Windows.Forms.Label();

            this.txtImg = new System.Windows.Forms.TextBox();

            this.label4 = new System.Windows.Forms.Label();

            this.SuspendLayout();

            // 

            // button1

            // 

            this.button1.Location = new System.Drawing.Point(42, 37);

            this.button1.Name = "button1";

            this.button1.Size = new System.Drawing.Size(210, 33);

            this.button1.TabIndex = 0;

            this.button1.Text = "웹크롤링";

            this.button1.UseVisualStyleBackColor = true;

            this.button1.Click += new System.EventHandler(this.button1_Click);

            // 

            // txtURL

            // 

            this.txtURL.Font = new System.Drawing.Font("맑은 고딕", 14.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point);

            this.txtURL.Location = new System.Drawing.Point(287, 35);

            this.txtURL.Name = "txtURL";

            this.txtURL.Size = new System.Drawing.Size(473, 33);

            this.txtURL.TabIndex = 1;

            // 

            // label1

            // 

            this.label1.AutoSize = true;

            this.label1.Location = new System.Drawing.Point(49, 93);

            this.label1.Name = "label1";

            this.label1.Size = new System.Drawing.Size(95, 15);

            this.label1.TabIndex = 2;

            this.label1.Text = "전체 HTML 소스";

            // 

            // label2

            // 

            this.label2.AutoSize = true;

            this.label2.Location = new System.Drawing.Point(42, 267);

            this.label2.Name = "label2";

            this.label2.Size = new System.Drawing.Size(67, 15);

            this.label2.TabIndex = 3;

            this.label2.Text = "하이퍼링크";

            // 

            // txtResult

            // 

            this.txtResult.Location = new System.Drawing.Point(42, 111);

            this.txtResult.Multiline = true;

            this.txtResult.Name = "txtResult";

            this.txtResult.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;

            this.txtResult.Size = new System.Drawing.Size(718, 93);

            this.txtResult.TabIndex = 4;

            // 

            // txtHyperLink

            // 

            this.txtHyperLink.Location = new System.Drawing.Point(42, 285);

            this.txtHyperLink.Multiline = true;

            this.txtHyperLink.Name = "txtHyperLink";

            this.txtHyperLink.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;

            this.txtHyperLink.Size = new System.Drawing.Size(718, 95);

            this.txtHyperLink.TabIndex = 5;

            // 

            // txtTitle

            // 

            this.txtTitle.Font = new System.Drawing.Font("맑은 고딕", 14.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point);

            this.txtTitle.Location = new System.Drawing.Point(91, 215);

            this.txtTitle.Name = "txtTitle";

            this.txtTitle.Size = new System.Drawing.Size(669, 33);

            this.txtTitle.TabIndex = 6;

            // 

            // label3

            // 

            this.label3.AutoSize = true;

            this.label3.Location = new System.Drawing.Point(42, 226);

            this.label3.Name = "label3";

            this.label3.Size = new System.Drawing.Size(43, 15);

            this.label3.TabIndex = 7;

            this.label3.Text = "타이틀";

            // 

            // txtImg

            // 

            this.txtImg.Location = new System.Drawing.Point(42, 417);

            this.txtImg.Multiline = true;

            this.txtImg.Name = "txtImg";

            this.txtImg.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;

            this.txtImg.Size = new System.Drawing.Size(718, 91);

            this.txtImg.TabIndex = 9;

            // 

            // label4

            // 

            this.label4.AutoSize = true;

            this.label4.Location = new System.Drawing.Point(42, 399);

            this.label4.Name = "label4";

            this.label4.Size = new System.Drawing.Size(43, 15);

            this.label4.TabIndex = 8;

            this.label4.Text = "이미지";

            // 

            // Form1

            // 

            this.AutoScaleDimensions = new System.Drawing.SizeF(7F, 15F);

            this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;


            this.ClientSize = new System.Drawing.Size(799, 535);

            this.Controls.Add(this.txtImg);

            this.Controls.Add(this.label4);

            this.Controls.Add(this.label3);

            this.Controls.Add(this.txtTitle);

            this.Controls.Add(this.txtHyperLink);

            this.Controls.Add(this.txtResult);

            this.Controls.Add(this.label2);

            this.Controls.Add(this.label1);

            this.Controls.Add(this.txtURL);

            this.Controls.Add(this.button1);

            this.Name = "Form1";

            this.Text = "Form1";

            this.ResumeLayout(false);

            this.PerformLayout();


        }


        #endregion


        // Button captioned "웹크롤링"; its Click event is wired to button1_Click.
        private Button button1;

        // Single-line text box placed to the right of button1; button1_Click reads the address from it.
        private TextBox txtURL;

        // Caption label "전체 HTML 소스", positioned just above txtResult.
        private Label label1;

        // Caption label "하이퍼링크", positioned just above txtHyperLink.
        private Label label2;

        // Multiline text box with a vertical scroll bar; button1_Click writes the downloaded HTML here.
        private TextBox txtResult;

        // Multiline text box with a vertical scroll bar; passed to ParseHTML as the hyperlink output.
        private TextBox txtHyperLink;

        // Single-line text box; passed to ParseHTML as the title output.
        private TextBox txtTitle;

        // Caption label "타이틀", positioned to the left of txtTitle.
        private Label label3;

        // Multiline text box with a vertical scroll bar; passed to ParseHTML as the image-source output.
        private TextBox txtImg;

        // Caption label "이미지", positioned just above txtImg.
        private Label label4;

    }

}



  • 4. Form1.cs


using HtmlAgilityPack;

using System.Net;

using System.Text;


namespace WinFormsApp4
{
    /// <summary>
    /// Code-behind for the crawler form: downloads the page whose address is typed into
    /// <c>txtURL</c>, shows the raw HTML, and lists the page title, hyperlinks and image sources.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Creates the form and builds its controls.</summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler of <c>button1</c>: downloads the requested page and fills the result boxes.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            string address = txtURL.Text.Trim();
            if (address.Length == 0)
            {
                // Nothing to download.
                return;
            }

            // Only prepend a scheme when the user did not type one; blindly prefixing
            // "http://" would turn "https://example.com" into an invalid address.
            bool hasScheme = address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                address = "http://" + address;
            }

            string htmlSource;
            try
            {
                using (WebClient client = new WebClient()) // WebClient implements IDisposable
                {
                    client.Encoding = Encoding.UTF8;
                    htmlSource = client.DownloadString(address);
                }
            }
            catch (WebException ex)
            {
                // Unreachable host, HTTP error, malformed address, ...: report it instead of crashing the form.
                MessageBox.Show(this, ex.Message, Text, MessageBoxButtons.OK, MessageBoxIcon.Error);
                return;
            }

            txtResult.Text = htmlSource;
            Console.WriteLine(htmlSource);
            // No Console.ReadLine() here: waiting for console input inside a click handler
            // would freeze the UI thread whenever a console is attached.

            HTMLParser parser = new HTMLParser();
            parser.ParseHTML(htmlSource, this.txtHyperLink, this.txtTitle, this.txtImg);
        }

        /// <summary>
        /// Extracts the title, anchor targets and image sources from an HTML document
        /// using HtmlAgilityPack and writes them into the supplied text boxes.
        /// </summary>
        public class HTMLParser
        {
            /// <summary>
            /// Parses <paramref name="htmlSource"/> and fills the three output text boxes.
            /// Previous contents of the boxes are replaced, so repeated calls do not accumulate
            /// stale results.
            /// </summary>
            /// <param name="htmlSource">HTML markup to parse.</param>
            /// <param name="hyperLink">Receives one <c>href</c> value per line for every &lt;a&gt; under &lt;body&gt;.</param>
            /// <param name="title">Receives the text of &lt;head&gt;/&lt;title&gt;, or an empty string when absent.</param>
            /// <param name="img">Receives one <c>src</c> value per line for every &lt;img&gt; under &lt;body&gt;.</param>
            public void ParseHTML(string htmlSource, TextBox hyperLink, TextBox title, TextBox img)
            {
                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(htmlSource);

                // SelectSingleNode returns null when the document has no <title>.
                title.Text = doc.DocumentNode.SelectSingleNode("//head/title")?.InnerText ?? string.Empty;

                // Collect into builders and assign once: repeated TextBox.Text += is quadratic
                // and triggers a repaint for every appended line.
                StringBuilder links = new StringBuilder();
                StringBuilder images = new StringBuilder();

                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection bodyNodes = doc.DocumentNode.SelectNodes("//body");
                if (bodyNodes != null)
                {
                    foreach (HtmlNode bodyNode in bodyNodes)
                    {
                        // The leading "." keeps the search inside this body node;
                        // a bare "//a" would restart from the document root.
                        AppendAttributeValues(bodyNode, ".//a", "href", "a : ", links);
                        AppendAttributeValues(bodyNode, ".//img", "src", "src : ", images);
                    }
                }

                hyperLink.Text = links.ToString();
                img.Text = images.ToString();
            }

            /// <summary>
            /// Appends the value of <paramref name="attributeName"/> for every node matching
            /// <paramref name="xpath"/> below <paramref name="root"/>, one value per line.
            /// </summary>
            private static void AppendAttributeValues(
                HtmlNode root, string xpath, string attributeName, string logPrefix, StringBuilder output)
            {
                HtmlNodeCollection matches = root.SelectNodes(xpath);
                if (matches == null)
                {
                    return;
                }

                foreach (HtmlNode node in matches)
                {
                    var value = node.Attributes[attributeName]?.Value;
                    if (value == null)
                    {
                        // Element without the attribute (e.g. <a name="...">): nothing to list.
                        continue;
                    }

                    // A WinForms multiline TextBox only breaks lines on CR+LF, so use
                    // Environment.NewLine rather than a bare "\n".
                    output.Append(value).Append(Environment.NewLine);
                    Console.WriteLine(logPrefix + value);
                }
            }
        }
    }
}




#HtmlAgilityPack, #HTMLAgility, #웹크롤러, #WebCrawler, #웹크롤링, #웹페이지파싱, #시샵동영상, #시샵교육, #닷넷교육, #닷넷학원, #시샵학원, #닷넷동영상, HtmlAgilityPack, HTMLAgility, 웹크롤러, WebCrawler, 웹크롤링, 웹페이지파싱, 시샵동영상, 시샵교육, 닷넷교육, 닷넷학원, 시샵학원, 닷넷동영상

(C#교육동영상)C# ADO.NET 실습 ODP.NET/ODAC 설치 오라클 함수 호출 실습, C#학원, WPF학원, 닷넷학원, 자바학원

  (C#교육동영상)C# ADO.NET 실습  ODP.NET/ODAC 설치  오라클 함수 호출 실습, C#학원, WPF학원, 닷넷학원, 자바학원 https://www.youtube.com/watch?v=qIPU85yAlzc&list=PLxU-i...