<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.4 20241031//EN"
        "https://jats.nlm.nih.gov/publishing/1.4/JATS-journalpublishing1-4.dtd">
<article  article-type="research-article"        dtd-version="1.4">
            <front>

                <journal-meta>
                                                                <journal-id>veri bilim derg</journal-id>
            <journal-title-group>
                                                                                    <journal-title>Veri Bilimi</journal-title>
            </journal-title-group>
                                        <issn pub-type="epub">2667-582X</issn>
                                                                                            <publisher>
                    <publisher-name>Murat GÖK</publisher-name>
                </publisher>
                    </journal-meta>
                <article-meta>
                                        <article-id/>
                                                                <article-categories>
                                            <subj-group  xml:lang="en">
                                                            <subject>Engineering</subject>
                                                    </subj-group>
                                            <subj-group  xml:lang="tr">
                                                            <subject>Mühendislik</subject>
                                                    </subj-group>
                                    </article-categories>
                                                                                                                                                        <title-group>
                                                                                                                        <article-title>Topluluk sınıflandırıcı kullanarak zararlı url tespiti</article-title>
                                                                                                                                                                                                <trans-title-group xml:lang="en">
                                    <trans-title>Malicious urls detection using ensemble classifier</trans-title>
                                </trans-title-group>
                                                                                                    </title-group>
            
                                                    <contrib-group content-type="authors">
                                                                        <contrib contrib-type="author">
                                                                <name>
                                    <surname>Köksal</surname>
                                    <given-names>Kübra</given-names>
                                </name>
                                                                    <aff>Marmara Üniversitesi</aff>
                                                            </contrib>
                                                    <contrib contrib-type="author">
                                                                    <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0003-1062-2439</contrib-id>
                                                                <name>
                                    <surname>Doğan</surname>
                                    <given-names>Buket</given-names>
                                </name>
                                                                    <aff>Marmara Üniversitesi</aff>
                                                            </contrib>
                                                    <contrib contrib-type="author">
                                                                    <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0003-3875-1793</contrib-id>
                                                                <name>
                                    <surname>Altıkardeş</surname>
                                    <given-names>Zehra Aysun</given-names>
                                </name>
                                                                    <aff>Marmara Üniversitesi</aff>
                                                            </contrib>
                                                                                </contrib-group>
                        
                                        <pub-date pub-type="pub" iso-8601-date="20211230">
                    <day>30</day>
                    <month>12</month>
                    <year>2021</year>
                </pub-date>
                                        <volume>4</volume>
                                        <issue>3</issue>
                                        <fpage>113</fpage>
                                        <lpage>122</lpage>
                        
                        <history>
                                    <date date-type="received" iso-8601-date="20210706">
                        <day>06</day>
                        <month>07</month>
                        <year>2021</year>
                    </date>
                                                    <date date-type="accepted" iso-8601-date="20210807">
                        <day>07</day>
                        <month>08</month>
                        <year>2021</year>
                    </date>
                            </history>
                                        <permissions>
                    <copyright-statement>Copyright © 2018, Veri Bilimi</copyright-statement>
                    <copyright-year>2018</copyright-year>
                    <copyright-holder>Veri Bilimi</copyright-holder>
                </permissions>
            
                                                                                                <abstract><p>Teknolojinin gelişmesi ve internet kullanıcı sayısındaki artışla orantılı olarak siber suçlarda da artış gözlemlenmiştir. Birçok farklı siber saldırı tekniği bulunmaktadır. Bu saldırı tekniklerinden biri olan kötü amaçlı web siteleri, siber saldırılar ve dolandırıcılık olaylarında önemli rol oynamaktadır. İnternette masum görünen bir bağlantıya tıklamak veya e-posta ve mesaj yoluyla gönderilen bir web sayfasını ziyaret etmek arka planda sistemimizde kimlik avı kampanyalarının başlatılmasına, kötü amaçlı yazılımların, casus yazılımların, fidye yazılımların indirilmesine ve ciddi parasal kayıplar oluşmasına yol açar. Dolayısıyla bu tehditlerin etkin bir şekilde tespit edilmesi ve önlenmesi bireyler, kurumlar ve hükümetler için oldukça önemli bir konu haline gelmiştir. Kara listeye dayalı yöntemler, kötü amaçlı URL&#039;leri tanımlamak için kullanılan standart yöntemlerden biridir. Ancak kara listeler hiçbir zaman kapsamlı değildir ve yeni oluşturulan URL&#039;leri algılama yeteneğinden yoksundur. Kara listeye dayalı yöntemlerin mevcut ihtiyacı ve eksiklikleri de göz önünde bulundurularak bu çalışmada toplulukla öğrenme yöntemleri kullanılarak bir sınıflandırma yaklaşımı önerilmiştir. Çalışmada iyi huylu ve kötü huylu URL’lerden elde edilmiş 79 sözcüksel özellik içeren Kanada Siber Güvenlik Enstitüsü&#039;nün URL veriseti (ISCX-URL-2016) üzerinde çalışılmıştır. Verisetinde benign, spam, phishing, malware ve defacement olmak üzere beş farklı URL türü bulunmaktadır. Toplam 7781 iyi huylu ve 28.917 tane zararlı URL kaydı üzerinde zararlı, zararsız etiketleri kullanılarak ikili sınıflandırma işlemi ve beş farklı etiket bilgisi kullanılarak çoklu sınıflandırma işlemi gerçekleştirilmiştir. 
Makine öğrenmesi yöntemlerinden Rastgele Orman algoritması uygulanan yöntemin başarısının sınanması için 10-katlamalı çapraz doğrulama (10-fold cross validation) ile birlikte kullanılmıştır ve 10 temel bileşen kullanılarak ikili sınıflandırma problemi için ortalama %99.42, çoklu sınıflandırma problemi için ortalama %95.68 doğruluk değeri elde edilmiştir. Böylece sisteme her gün yenilerinin katıldığı, dinamik ortamdaki kötü niyetli tasarlanmış web sitelerinden korunmaya yönelik yüksek başarım oranına sahip bir model önerisi sunulmuştur.</p></abstract>
                                                                                                                                    <trans-abstract xml:lang="en">
                            <p>In parallel with the development of technology and the increase in the number of internet users, an increase in cybercrime has been observed. There are many different cyberattack techniques. Malicious websites, one of these attack techniques, play an important role in cyberattacks and fraud events. Clicking on an innocent-looking link on the Internet or visiting a web page sent via email or text message can result in phishing campaigns being launched on our system in the background, malware, spyware and ransomware being downloaded, and serious monetary losses. Therefore, effective detection and prevention of these threats has become a very important issue for individuals, institutions and governments. Blacklist-based methods are one of the standard methods used to identify malicious URLs. However, blacklists are never comprehensive and lack the ability to detect newly created URLs. Considering the current needs and deficiencies of blacklist-based methods, a machine learning based classification approach was used in this study to combat malicious URLs. In the study, the URL data set of the Canadian Institute for Cybersecurity (ISCX-URL-2016) was studied, which contains 79 lexical features obtained from benign and malicious URLs. There are five different URL types in the dataset: benign, spam, phishing, malware and defacement. A binary classification process using harmless and malicious labels and a multi-class classification process using five different labels were performed on a total of 7781 benign and 28,917 malicious URL records. The Random Forest algorithm, one of the machine learning methods, was used together with 10-fold cross validation to validate the success of the applied method, and an average accuracy value of 99.42% for the binary classification problem and 95.68% for the multiple classification problem was obtained.
Thus, a model proposal with a high-performance rate is presented to protect from maliciously designed websites in a dynamic environment, where new ones join the system every day.</p></trans-abstract>
                                                            
            
                                                            <kwd-group>
                                                    <kwd>Kötü niyetli URL</kwd>
                                                    <kwd>siber güvenlik</kwd>
                                                    <kwd>makine öğrenmesi</kwd>
                                                    <kwd>sıradışı veri</kwd>
                                                    <kwd>rastgele orman</kwd>
                                            </kwd-group>
                                                        
                                                                            <kwd-group xml:lang="en">
                                                    <kwd>Malicious URL</kwd>
                                                    <kwd>cyber security</kwd>
                                                    <kwd>machine learning</kwd>
                                                    <kwd>outlier data</kwd>
                                                    <kwd>random forest</kwd>
                                            </kwd-group>
                                                                                                            </article-meta>
    </front>
    <back>
                            <ref-list>
                                    <ref id="ref1">
                        <label>1</label>
                        <mixed-citation publication-type="journal">R. A. Dwan Jr., A. M. Tavares, “Predictive Analysis: Machine Learning Models for URL Classification”, Faculty of Worcester Polytechnic Institute, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref2">
                        <label>2</label>
                        <mixed-citation publication-type="journal">D. K. McGrath and M. Gupta, &quot;Behind phishing: An examination of phisher modi operandi&quot;, Proc. LEET, pp. 4, April 2008.</mixed-citation>
                    </ref>
                                    <ref id="ref3">
                        <label>3</label>
                        <mixed-citation publication-type="journal">J. Ma, L. K. Saul, S. Savage and G. M. Voelker, &quot;Identifying suspicious URLs: An application of large-scale online learning&quot;, Proc. Int. Conf. Mach. Learn., 681-688, 2009.</mixed-citation>
                    </ref>
                                    <ref id="ref4">
                        <label>4</label>
                        <mixed-citation publication-type="journal">K. Thomas, C. Grier, J. Ma, V. Paxson and D. Song, &quot;Design and Evaluation of a Real-Time URL Spam Filtering Service,&quot; 2011 IEEE Symposium on Security and Privacy, 447-462, 2011.</mixed-citation>
                    </ref>
                                    <ref id="ref5">
                        <label>5</label>
                        <mixed-citation publication-type="journal">H Choi, B B Zhu and H. Lee, &quot;Detecting malicious web links and identifying their attack types[C]&quot;, Usenix Conference on Web Application Development, 11-11, 2011.</mixed-citation>
                    </ref>
                                    <ref id="ref6">
                        <label>6</label>
                        <mixed-citation publication-type="journal">M. Lin, C. Chiu, Y. Lee and H. Pao, &quot;Malicious URL filtering—A big data application&quot;, Proc. IEEE Int. Conf. Big Data, 589-596, 2013.</mixed-citation>
                    </ref>
                                    <ref id="ref7">
                        <label>7</label>
                        <mixed-citation publication-type="journal">W. Chu, B. B. Zhu, F. Xue, X. Guan and Z. Cai, &quot;Protect sensitive sites from phishing attacks using features extractable from inaccessible phishing urls&quot;, 2013 IEEE International Conference on Communications (ICC), 2013, 1990-1994.</mixed-citation>
                    </ref>
                                    <ref id="ref8">
                        <label>8</label>
                        <mixed-citation publication-type="journal">M. S. I. Mamun, M. A. Rathore, A. H. Lashkari, N. Stakhanova and A. A. Ghorbani, &quot;Detecting malicious URLs using lexical analysis&quot;, Proc. Int. Conf. Netw. Syst. Secur, 467-482, 2016.</mixed-citation>
                    </ref>
                                    <ref id="ref9">
                        <label>9</label>
                        <mixed-citation publication-type="journal">A. Joshi, L. Lloyd, P. Westin and S. Seethapathy, &quot;Using lexical features for malicious url detection – a machine learning approach&quot;, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref10">
                        <label>10</label>
                        <mixed-citation publication-type="journal">A. Powell, D. Bates, C. Van Wyk, and A. Darren de Abreu, “A cross-comparison of feature selection algorithms on multiple cyber security data-sets,” Stellenbosch University, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref11">
                        <label>11</label>
                        <mixed-citation publication-type="journal">S. Singhal, U. Chawla and R. Shorey, &quot;Machine Learning &amp; Concept Drift based Approach for Malicious Website Detection&quot;, 2020 12th International Conference on Communication Systems &amp; Networks, 582-585, 2020.</mixed-citation>
                    </ref>
                                    <ref id="ref12">
                        <label>12</label>
                        <mixed-citation publication-type="journal">S. Wang, Y. Wang and M. Tang, &quot;Auto Malicious Websites Classification Based on Naive Bayes Classifier,&quot; 2020 IEEE 3rd International Conference on Information Systems and Computer Aided Education (ICISCAE), 443-447, 2020.</mixed-citation>
                    </ref>
                                    <ref id="ref13">
                        <label>13</label>
                        <mixed-citation publication-type="journal">O. K. Sahingoz, E. Buber, O. Demir and B. Diri, &quot;Machine learning based phishing detection from urls&quot;, Expert Syst. Appl., vol. 117, pp. 345-357, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref14">
                        <label>14</label>
                        <mixed-citation publication-type="journal">Canadian Institute for CyberSecurity, “URL Dataset (ISCX-URL-2016)”, https://www.unb.ca/cic/datasets/url-2016.html, 2016, (5 Mayıs 2020).</mixed-citation>
                    </ref>
                                    <ref id="ref15">
                        <label>15</label>
                        <mixed-citation publication-type="journal">Principal Component Analysis Tutorial, https://www.dezyre.com/data-science-in-python-tutorial/principal-component-analysis-tutorial, (20 Haziran 2020).</mixed-citation>
                    </ref>
                                    <ref id="ref16">
                        <label>16</label>
                        <mixed-citation publication-type="journal">Y. Sun, H. Zhang, T. Zhao, Z. Zou, B. Shen and L. Yang, &quot;A new convolutional neural network with random forest method for hydrogen sensor fault diagnosis&quot;, IEEE Access, vol. 8, 85421-85430, 2020.</mixed-citation>
                    </ref>
                            </ref-list>
                    </back>
    </article>
