#

 

Aufgabe:

Die Anwendung durchforstet einen kompletten Bereich von Ebay Kleinanzeigen.

Die Anwendung ist in C#, WPF geschrieben.

 

Basis:

Ebay Kleinanzeigen filtert die Ergebnisse nicht nach Fulltext Eingaben, sondern es werden nur die Ergebnisse angezeigt,

welche die Suchbegriffe im Titel oder in den Stichworten haben.

Dadurch ist eine effektive Suche sinnlos.

Man kann zum Beispiel nicht nach XTR Schaltungen suchen oder nach dem Begriff NP für Neupreis in  der Anzeige

 

SQL Programm unter Windows 10

 

 

SQL Datenbank

Dabei werden die Werte in eine lokale Datenbank gespeichert

Ein sammeln von mehreren tausend Datensätzen ist hier kein Problem.

 

 

 

C# Code Listing

 

 

 

 

Using Namespaces

using System;

using System.Collections.Generic;

using System.Text;

using System.Threading.Tasks;

using System.Windows;

//< add using >

using System.IO;                //*Stream, Streamreader

using System.Net;               //*NetRequest

using HtmlAgilityPack;          //*Html Website

using System.Data.SqlClient;    //*local DB

using System.Data;              //*ConnectionState, DataTable

using System.Windows.Threading;

//</ add using >

 

 

 

 

namespace WebRobot_Kleinanzeigen

{

 

    public partial class MainWindow : Window

    {

 

 

 

 

 

 

        #region Form

        //< refresh delegate >

        private delegate void EmptyDelegate();

        //</ refresh delegate >

 

        //--------------------< region: Form >---------------------

        public MainWindow()

        {

            InitializeComponent();

            optStop.IsChecked = false;

        }

        //--------------------</ region: Form >---------------------

        #endregion /Form

 

 

Region Buttons

 

        #region Buttons

        //--------------------< region: Buttons >---------------------

        private void Button_Start_Click(object sender, RoutedEventArgs e)

        {

            //--------< Button_Start_Click() >--------

            optStop.IsChecked = false;

            Scrape_all_Lists();

            //--------</ Button_Start_Click() >--------

        }

 

 

        private void Button_Details_Click(object sender, RoutedEventArgs e)

        {

            //--------< Button_Details_Click() >--------

            optStop.IsChecked = false;

            Scrape_all_Details();

            //--------</ Button_Details_Click() >--------

        }

        //--------------------</ region: Buttons >---------------------

        #endregion /Buttons

 

 

Liste sammeln

Bei webrobotern muss man immer erst eine Liste erfassen und anschliessend die Details ausarbeiten.

Hier das Sammeln Daten aus der Liste

#region Methods Scan

        //--------------------< region: Methods Scan >---------------------

        private void Scrape_all_Lists()

        {

            //--------< Scrape_all_Lists() >--------

            fx_Log("----< Read Lists >----");

            Textbox_URL.Text = "https://www.ebay-kleinanzeigen.de/s-fahrraeder/damen/c217+fahrraeder.art_s:damen";

            while (Textbox_URL.Text != "")

            {

                if (optStop.IsChecked == true) return;

                Scrape_List();

            }

            fx_Log("----</ Read Lists >----");

            //--------</ Scrape_all_Lists() >--------

        }

 

        private void Scrape_List()

        {

            //--------< Scrape_List() >--------

            //*freiberuflich https://www.ebay-kleinanzeigen.de/s-fahrraeder/damen/c217+fahrraeder.art_s:damen

            //*weiter mit span class=next /de/jobs/temporary/?job_types=temporary&page=2

            string sURL = Textbox_URL.Text;

            Uri baseUrl = new Uri(sURL);

 

            if (optStop.IsChecked == true) return;

 

            string sPage = baseUrl.Query;

            fx_Log("--< " + sPage + " >--");

 

            HtmlDocument doc = Web_Get_HtmlDocument(sURL);

            if (doc == null) return;

 

            //< node01 >

            HtmlNode node_Area = doc.DocumentNode.SelectSingleNode("//ul[@id=\"srchrslt-adtable\"]"); //*find subnode with

            //</ node01 >

            if (node_Area != null)

            {

                //< nodes >

                HtmlNodeCollection nodes = node_Area.SelectNodes("//h2[@class=\"text-module-begin\"]"); //*find subnode with

                //< nodes >

 

                //------< @Loop: Detail Nodes >------

                foreach (HtmlNode node in nodes)

                {

                    //----< In Detail-Node >----

                    if (optStop.IsChecked == true) return;

 

                    //--< get a_href >--

                    HtmlNode node_to_Detail = node.SelectSingleNode("a");

                    string sTitle = node_to_Detail.InnerText;

                    string sURL_Detail_relative = node_to_Detail.GetAttributeValue("href", "");

                    string sURL_Detail_absolute = new Uri(baseUrl, sURL_Detail_relative).AbsoluteUri;

 

                    db_Update_Add_ListRecord(sURL_Detail_absolute, sTitle);

 

                    //< print >

                    fx_Log(sTitle);

                    //</ print >

                    //--</ get a_href >--

                    //----</ In Detail-Node >----

                }

                //------</ @Loop: Detail Nodes >------

 

                fx_Log("--</ " + sPage + " >--");

                fx_Log("");

 

                HtmlNode node_Next = doc.DocumentNode.SelectSingleNode("//a[@class=\"pagination-next\"]");

                if (node_Next != null)

                {

                    string sURL_Next_relative = node_Next.GetAttributeValue("href", "");

                    string sURL_Next = new Uri(baseUrl, sURL_Next_relative).AbsoluteUri;

                    Textbox_URL.Text = sURL_Next;

                }

                else

                {

                    Textbox_URL.Text = "";

                }                                                                                              //</ nodes >

            }

 

 

            //--------</ Scrape_List() >--------

        }

 

 

Auswerten aller Details

Anschliessend warden alle einzelnen Webseiten aufgerufen und die Texte gesammelt

#region Methods Scan

        ..

private void Scrape_all_Details()

        {

            //--------< Scrape_all_Details() >--------

            fx_Log("----< @Read Details >----");

 

            //----< @Loop: Alle Empty Records >----

            while (1 == 1)

            {

                //----< Detail >----

                if (optStop.IsChecked == true) return;

 

                //< find record >

                string sSQL = "SELECT TOP 1 [IDDetail] FROM tbl_Details WHERE [dtScan] IS NULL";

                DataRow row = clsDB.Get_DataRow(sSQL);

                //</ find record >

 

                if (row != null)

                {

                    Scrape_Detail(Convert.ToInt32(row["IDDetail"]));

                }

                else

                {

                    break;

                }

                //----</ Detail >----

            }

            //----</ @Loop: Alle Empty Records >----

 

            fx_Log("----</ @Read Details >----");

            //--------</ Scrape_all_Details() >--------

        }

 

        private void Scrape_Detail(int ID)

        {

            //--------< Scrape_Detail() >--------

            if (optStop.IsChecked == true) return;

 

            //fx_Log("--< Read Detail >--");

            fx_Log("Detail=" + ID);

 

            string sURL = clsDB.Get_Value_as_String("URL", "tbl_Details", "[IDDetail]=" + ID);

            HtmlDocument doc = Web_Get_HtmlDocument(sURL);

            //< check >

            if (doc == null)

            {

                //< update >

                string sql_Error = "UPDATE tbl_Details SET [dtScan] = SYSDATETIME() WHERE IDDetail = " + ID;

                clsDB.Execute_SQL(sql_Error);

                //</ update >

                return;

            }

            //</ check >

 

            //----< In Detail-Node >----

            //< Text >

            string sText = "";

            HtmlNode nodeText = doc.DocumentNode.SelectSingleNode("//p[@id=\"viewad-description-text\"]");

            if (nodeText != null) {

                sText = nodeText.InnerText;

                sText = sText.Trim();

            }

            //</ Text >

 

            //--< Check deleted >--

            if (sText=="")

            {

                HtmlNode nodeWarning = doc.DocumentNode.SelectSingleNode("//div[@class=\"outcomemessage-warning\"]"); //class outcomemessage-warning, ID=srchrslt-adexpired

                if (nodeWarning == null )

                {

                    return; //abbruch ohne Text

                }

                else

                {

                    //expired, deleted

                    //< delete >

                    string sql_Error = "DELETE FROM tbl_Details WHERE IDDetail = " + ID;

                    clsDB.Execute_SQL(sql_Error);

                    //</ delete >

                    return;

                }

            }

            //--</ Check deleted >--

 

            if (sText.Length>50)

            {

                fx_Log("text=" + sText.Substring(0, 50));

            }

            else

            {

                fx_Log("text=" + sText);

            }

           

 

            //< Preis >

            string sPreis = "";

            int intPreis = 0;

            HtmlNode nodePreis = doc.DocumentNode.SelectSingleNode("//h2[@id=\"viewad-price\"]");

            if (nodePreis != null)

            {

                sPreis = nodePreis.InnerText;

                sPreis = sPreis.Replace("Preis:","");

                sPreis = sPreis.Replace("€", "");

                sPreis = sPreis.Replace("VB", "");

                sPreis = sPreis.Replace(".", "");

                sPreis = sPreis.Replace("Zu verschenken", "");

                sPreis = sPreis.Trim();

               

                if(sPreis=="")

                {

                    intPreis = 0;

                }

                else

                {

                    intPreis = Convert.ToInt32(sPreis);

                }

               

            }

            //</ Preis >

 

            //--< Special >--

            string sErstellungsdatum = html_GetText_Kleinanzeigen_Info_Text(doc, "Erstellungsdatum:");  //19.02.2018

            string sSchlagwoerter = html_GetText_Kleinanzeigen_Info_Links (doc, "Schlagwörter:");

            string sArt = html_GetText_Kleinanzeigen_Info_Links(doc, "Art:");

            //--</ Special >--

 

            //< correct >

            sText = clsCheck.correct_String(sText);

            sErstellungsdatum = clsCheck.correct_String(sErstellungsdatum );

            sSchlagwoerter  = clsCheck.correct_String(sSchlagwoerter );

            sArt = clsCheck.correct_String(sArt);

            //</ correct >

 

            //Date to Iso

            //*yyyy-mm-dd

            string sDate_ISO = sErstellungsdatum.Substring(6, 4) + "-" + sErstellungsdatum.Substring(3, 2) + "-" + sErstellungsdatum.Substring(0, 2);

 

            //< update >

            string sql_Update = "UPDATE tbl_Details ";

            sql_Update += Environment.NewLine + " SET [Text]='" + sText + "',    [dtScan] = SYSDATETIME()";

            sql_Update += Environment.NewLine + " ,[Schlagwoerter]='" + sSchlagwoerter  + "',[Art]='" + sArt + "'";

            sql_Update += Environment.NewLine + " ,[Erstellungsdatum]=CAST('" + sDate_ISO + "' AS DATETIME)";

            sql_Update += Environment.NewLine + " ,[Preis]=" + intPreis ;

            //in: 19.02.2018

            //out: CAST('2009 - 05 - 25' AS DATETIME)'

            sql_Update += " WHERE IDDetail = " + ID;

            clsDB.Execute_SQL(sql_Update);

            //</ update >

 

            //----</ In Detail-Node >----

 

 

            //----< Images >----

 

            //--< Image-Area >--

            HtmlNode node_Image_Area = doc.DocumentNode.SelectSingleNode("//div[@id=\"viewad-images\"]");

            if (node_Image_Area != null)

            {

 

                HtmlNodeCollection image_Nodes = node_Image_Area.SelectNodes("//img");

                foreach(HtmlNode imgNode in image_Nodes )

                {

                    string sImageURL = imgNode.GetAttributeValue("src", "");

                    if (sImageURL.Contains("/common/")==false )

                        {

                        //sImageURL = sImageURL.Replace("/", "\/");

 

                        //< add >

                        string sql_Add = "INSERT INTO tbl_Images ([IDDetail],[URL]) VALUES(" + ID + " , '" + sImageURL + "')";

                        clsDB.Execute_SQL(sql_Add);

                            //</ add >

                    }

                }

 

 

            }

            //--</ Image-Area >--

 

            //----</ Images >----

 

 

            //fx_Log("--</ Read Detail >--");

            fx_Log("/Detail=" + ID);

            //--------</ Scrape_Detail() >--------

        }

        //--------------------< region: Methods Scan >---------------------

        #endregion Methods Scan

 

 

 

Größere Aufgaben werden in Methoden zusammengefasst.

 

HTML Dokument erstellen aus Web-URL

Texte aus bestimmten Bereichen ermitteln

Links sammeln

 

      #region Methods HTML

        //--------------------< region: Methods >---------------------

        private HtmlAgilityPack.HtmlDocument Web_Get_HtmlDocument(string sURL)

        {

            //------------< fx_read_Page() >------------

            //* get the HTML Document of a website-URL     

            try

            {

                //-< init >-

                //< WebRequest and Response >

                WebRequest objRequest = WebRequest.Create(sURL);

                HttpWebResponse objResponse = (HttpWebResponse)objRequest.GetResponse();

                //</ WebRequest and Response >

 

                //< Stream and Reader >

                Stream objDataStream = objResponse.GetResponseStream();

                StreamReader TextReader = new StreamReader(objDataStream);

                //</ Stream and Reader >

                //-</ init >-

 

                //< download >

                //* Read Website to local String

                string sHTML = TextReader.ReadToEnd();

                //</ download >

 

                //< get HTMLdocument >

                //*create and load to local HtmlDocument

                HtmlDocument doc = new HtmlDocument();

                doc.LoadHtml(sHTML);

                //</ get HTMLdocument >

 

                //< output >

                return doc;

                //</ output >

            }

            catch (Exception)

            {

                return null;

            }

 

            //------------</ fx_read_Page() >------------

        }

 

        private string html_GetText_by_Class_Filter(HtmlDocument doc, string par_Html_Element_Type,string par_Class_ID, string parFilter)

        {

            string sReturn = "";

            //< Text >

            HtmlNode nodeText = doc.DocumentNode.SelectSingleNode("//" + par_Html_Element_Type + "[@" + par_Class_ID + "=\"" + parFilter + "\"]");

            if (nodeText != null) { sReturn = nodeText.InnerText; }

            //</ Text >

 

 

            sReturn.Trim();

            sReturn.TrimStart();

            return sReturn;

        }

 

        private HtmlNode  html_GetNode_by_Class_Filter_Innertext(HtmlDocument doc, string par_Html_Element_Type, string par_Class_ID, string parFilter,string innerText_Filter)

        {

            HtmlNode return_Node = null;

            //< Text >

            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//" + par_Html_Element_Type + "[@" + par_Class_ID + "=\"" + parFilter + "\"]");

            if (nodes != null)

            {

                foreach (HtmlNode node in nodes)

                {

                    if (node.InnerText == innerText_Filter)

                    {

                        return_Node = node;

                        break;

                    }

                   

                }

            }

            //</ Text >

            return return_Node;

        }

 

        private string html_GetText_Kleinanzeigen_Info_Links(HtmlDocument doc,  string sFilter_Inner)

        {

            string sReturn = "";

            HtmlNode node_Titel = html_GetNode_by_Class_Filter_Innertext(doc, "dt", "class", "attributelist--key", sFilter_Inner);

            if (node_Titel != null)

            {

                HtmlNode node0 = node_Titel.NextSibling;

                if (node0 != null)

                {

                    HtmlNode node = node0.NextSibling;

                    if (node != null)

                    {

                        HtmlNodeCollection subLinks = node.SelectNodes(".//a");

                         foreach (HtmlNode subLink in subLinks)

                        {

                            string sTreffer  =subLink.InnerText;

                            sTreffer = sTreffer.Replace("\n","") ;

                            sTreffer=sTreffer.Trim();

                            sReturn += " " + sTreffer;

                        }

                       

                    }

                }

            }

            sReturn = sReturn.Trim();

            return sReturn;

        }

 

        private string html_GetText_Kleinanzeigen_Info_Text(HtmlDocument doc, string sFilter_Inner)

        {

            string sReturn = "";

            HtmlNode node_Titel = html_GetNode_by_Class_Filter_Innertext(doc, "dt", "class", "attributelist--key", sFilter_Inner);

            if (node_Titel != null)

            {

                HtmlNode node0 = node_Titel.NextSibling;

                if (node0 != null)

                {

                    HtmlNode node = node0.NextSibling;

                    if (node != null)

                    {

                        sReturn += node.InnerText;                       

                    }

                }

            }

            return sReturn;

        }

 

        private string html_GetText_OuterGroup_by_InnerFilter(HtmlDocument doc, string par_Html_Element_Type, string sFilter_Inner)

        {

            string sReturn = "";

 

            HtmlNode node_Inner = doc.DocumentNode.SelectSingleNode("//" + par_Html_Element_Type + "[text() = '" + sFilter_Inner + "']");

            if (node_Inner != null)

            {

                HtmlNode parent = node_Inner.ParentNode;

                node_Inner.Remove();

                sReturn = parent.InnerText;

                sReturn.Trim();

                sReturn.TrimStart();

            }

            return sReturn;

        }

        //--------------------</ region: Methods >---------------------

        #endregion /Methods Html

 

 

 

        #region Methods Data

        //--------------------< region: Methods Data >---------------------

        private void db_Update_Add_ListRecord(string sURL, string sTitle)

        {

            //--------< db_Update_Add_Record() >--------

            //*Update or add Record

            //< correct>

            sURL = sURL.Replace("'", "''");

            sTitle = sTitle.Replace("'", "''");

            //</ correct>

 

            //< find record >

            string sSQL = "SELECT TOP 1 * FROM tbl_Details WHERE [URL] Like '" + sURL + "'";

            DataTable tbl = clsDB.Get_DataTable(sSQL);

            //</ find record >

 

            if (tbl.Rows.Count == 0)

            {

                //< add >

                string sql_Add = "INSERT INTO tbl_Details ([URL],[Title],[dtList]) VALUES('" + sURL + "','" + sTitle + "',SYSDATETIME())";

                clsDB.Execute_SQL(sql_Add);

                //</ add >

            }

            else

            {

                //< update >

                string ID = tbl.Rows[0]["IDDetail"].ToString();

                string sql_Update = "UPDATE tbl_Details SET [dtList] = SYSDATETIME() WHERE IDDetail = " + ID;

                clsDB.Execute_SQL(sql_Update);

                //</ update >

            }

            //--------</ db_Update_Add_Record() >--------

        }

 

        //--------------------</ region: Methods Data >---------------------

        #endregion /Methods Data

 

        #region Sys

        //--------------------< region: Sys >---------------------

        private void fx_Log(string sLog)

        {

            //------------< fx_Log() >------------

            //* log Text to Textbox

            string sText = Textbox_Log.Text;

            sText = DateTime.Now + " " + sLog + Environment.NewLine + sText;

            if (sText.Length > 50000) { sText = sText.Substring(50000); }

            Textbox_Log.Text = sText;

            Textbox_Log.UpdateLayout();

            //< refresh >

            DoEvents();

            //UpdateLayout();

            //</ refresh >

            //------------</ fx_Log() >------------

        }

 

        protected void DoEvents()

        {

            //----< DoEvents() >----

            //* Diese Funktion uebernimmt die Unterbrechnung zur Anzeige und Eventbearbeitung in C#, WPF beim langen Loop Berechnungen

            //* mit einer Dispatcher

            //* EmptyDelegate im Header definieren

            //* using System.Windows.Threading; im Header festlegen

 

            Dispatcher.CurrentDispatcher.Invoke(DispatcherPriority.Background, new EmptyDelegate(delegate { }));

            //----</ DoEvents() >----

        }

        //--------------------</ region: Sys >---------------------

        #endregion /Sys

 

 

<Window x:Class="WebRobot_Kleinanzeigen.MainWindow"

        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"

        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"

        xmlns:d="http://schemas.microsoft.com/expression/blend/2008"

        xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"

        xmlns:local="clr-namespace:WebRobot_Kleinanzeigen"

        mc:Ignorable="d"

        Title="MainWindow" Height="550" Width="720">

    <Grid>

        <Button x:Name="Button_Start" Content="Start" HorizontalAlignment="Left" Margin="29,10,0,0" VerticalAlignment="Top" Width="75"

                Click="Button_Start_Click"                >

        </Button>

        <Label x:Name="Label_Button_Start" Content="read Lists" HorizontalAlignment="Left" VerticalAlignment="Top" Margin="117,11,0,0"/>

        <Button x:Name="Button_Details" Content="Details" HorizontalAlignment="Left" Margin="30,34,0,0" VerticalAlignment="Top" Width="75"

                Click="Button_Details_Click"

                />

        <Label x:Name="Label_Button_Details" Content="read all Details" HorizontalAlignment="Left" VerticalAlignment="Top" Margin="117,31,0,0"/>

        <RadioButton x:Name="optStop" IsChecked="False"  Content="Stop" HorizontalAlignment="Left" Margin="335,22,0,0" VerticalAlignment="Top"/>

 

        <Label x:Name="Label_URL" Content="Url:" HorizontalAlignment="Left" Margin="30,46,0,0" VerticalAlignment="Top"/>

        <TextBox x:Name="Textbox_URL" HorizontalAlignment="Left" Height="23" Margin="30,72,0,0" TextWrapping="Wrap"

                 Text="" VerticalAlignment="Top" Width="658"/>

 

        <Label x:Name="Label_Log" Content="log:" HorizontalAlignment="Left" Margin="29,100,0,0" VerticalAlignment="Top"/>

        <TextBox x:Name="Textbox_Log" HorizontalAlignment="Left" Height="385" Margin="29,126,0,0" TextWrapping="Wrap" Text=".." VerticalAlignment="Top" Width="659"/>

       

 

 

    </Grid>

</Window>

 

 

 

Hinterlegte Datenbank

CREATE TABLE [dbo].[tbl_Details] (

    [IDDetail]         INT            IDENTITY (1, 1) NOT NULL,

    [URL]              NVARCHAR (255) NULL,

    [Title]            NVARCHAR (255) NULL,

    [Text]             NVARCHAR (MAX) NULL,

    [dtScan]           DATETIME       NULL,

    [Schlagwoerter]    NVARCHAR (255) NULL,

    [Art]              NVARCHAR (255) NULL,

    [Erstellungsdatum] DATETIME       NULL,

    [Preis]            INT            NULL,

    [dtList]           DATETIME       NULL,

    PRIMARY KEY CLUSTERED ([IDDetail] ASC)

);

 

 

 

 

Und für Bilder

CREATE TABLE [dbo].[tbl_Images] (

    [IDImage]  INT            IDENTITY (1, 1) NOT NULL,

    [IDDetail] INT            NULL,

    [URL]      NVARCHAR (255) NULL,

    PRIMARY KEY CLUSTERED ([IDImage] ASC)

);

 

 

Mobile

.

soap2day