<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.4 20241031//EN"
        "https://jats.nlm.nih.gov/publishing/1.4/JATS-journalpublishing1-4.dtd">
<article article-type="research-article" dtd-version="1.4">
            <front>

                <journal-meta>
                                                                <journal-id>saucis</journal-id>
            <journal-title-group>
                                                                                    <journal-title>Sakarya University Journal of Computer and Information Sciences</journal-title>
            </journal-title-group>
                                        <issn pub-type="epub">2636-8129</issn>
                                                                                            <publisher>
                    <publisher-name>Sakarya University</publisher-name>
                </publisher>
                    </journal-meta>
                <article-meta>
                                        <article-id pub-id-type="doi">10.35377/saucis...1139765</article-id>
                                                                <article-categories>
                                            <subj-group  xml:lang="en">
                                                            <subject>Artificial Intelligence</subject>
                                                    </subj-group>
                                            <subj-group  xml:lang="tr">
                                                            <subject>Yapay Zeka</subject>
                                                    </subj-group>
                                    </article-categories>
                                                                                                                                                        <title-group>
                                                                                                                                                            <article-title>An Approach for Audio-Visual Content Understanding of Video using Multimodal Deep Learning Methodology</article-title>
                                                                                                    </title-group>
            
                                                    <contrib-group content-type="authors">
                                                                        <contrib contrib-type="author">
                                                                    <contrib-id contrib-id-type="orcid">
                                        https://orcid.org/0000-0002-5925-5759</contrib-id>
                                                                <name>
                                    <surname>Boztepe</surname>
                                    <given-names>Emre Beray</given-names>
                                </name>
                                    <aff>Çanakkale Onsekiz Mart University, Faculty of Engineering, Department of Computer Engineering, Computer Engineering Program</aff>
                                                            </contrib>
                                                    <contrib contrib-type="author">
                                                                    <contrib-id contrib-id-type="orcid">
                                        https://orcid.org/0000-0002-7255-9263</contrib-id>
                                                                <name>
                                    <surname>Karakaya</surname>
                                    <given-names>Bedirhan</given-names>
                                </name>
                                    <aff>Çanakkale Onsekiz Mart University, Faculty of Engineering, Department of Computer Engineering, Computer Engineering Program</aff>
                                                            </contrib>
                                                    <contrib contrib-type="author">
                                                                    <contrib-id contrib-id-type="orcid">
                                        https://orcid.org/0000-0001-8524-874X</contrib-id>
                                                                <name>
                                    <surname>Karasulu</surname>
                                    <given-names>Bahadir</given-names>
                                </name>
                                    <aff>Çanakkale Onsekiz Mart University, Faculty of Engineering, Department of Computer Engineering, Computer Engineering Program</aff>
                                                            </contrib>
                                                    <contrib contrib-type="author">
                                                                    <contrib-id contrib-id-type="orcid">
                                        https://orcid.org/0000-0002-6949-8666</contrib-id>
                                                                <name>
                                    <surname>Ünlü</surname>
                                    <given-names>İsmet</given-names>
                                </name>
                                    <aff>Çanakkale Onsekiz Mart University, Faculty of Engineering, Department of Computer Engineering, Computer Engineering Program</aff>
                                                            </contrib>
                                                                                </contrib-group>
                        
                <pub-date pub-type="pub" iso-8601-date="20220831">
                    <day>31</day>
                    <month>08</month>
                    <year>2022</year>
                </pub-date>
                                        <volume>5</volume>
                                        <issue>2</issue>
                                        <fpage>181</fpage>
                                        <lpage>207</lpage>
                        
                        <history>
                    <date date-type="received" iso-8601-date="20220702">
                        <day>02</day>
                        <month>07</month>
                        <year>2022</year>
                    </date>
                    <date date-type="accepted" iso-8601-date="20220706">
                        <day>06</day>
                        <month>07</month>
                        <year>2022</year>
                    </date>
                            </history>
                                        <permissions>
                    <copyright-statement>Copyright © 2018, Sakarya University Journal of Computer and Information Sciences</copyright-statement>
                    <copyright-year>2018</copyright-year>
                    <copyright-holder>Sakarya University Journal of Computer and Information Sciences</copyright-holder>
                </permissions>
            
                <abstract><p>This study presents an approach for recognizing the sound environment class of a video and understanding its spoken content, together with its sentimental context, by processing the audio-visual content with a multimodal deep learning methodology. The approach begins by using deep learning to cut out the parts of a given video in which the most action happens; these cut parts are then concatenated into a new video clip. A deep learning network model previously trained for sound recognition then performs sound prediction. The model was trained on sound clips from ten different categories, selected according to where the action is most likely to have happened, in order to predict sound classes. Then, to strengthen the sound recognition result, any speech present in the new video is extracted. Using Natural Language Processing (NLP) and Named Entity Recognition (NER), this speech is categorized according to whether its words have a connotation of any of the ten categories. Sentiment analysis and the Apriori algorithm from Association Rule Mining (ARM) then identify the frequent categories in the concatenated video and help us define the relationships between the categories obtained. According to the highest performance evaluation values from our experiments, the accuracy of sound environment recognition for a given video's processed scene is 70%, and the average Bilingual Evaluation Understudy (BLEU) score for speech-to-text with the VOSK speech recognition toolkit is 90% for the English language model and 81% for the Turkish language model. A discussion and conclusions based on the scientific findings are included in our study.</p></abstract>
                                                            
            
                                                                                        <kwd-group>
                                                    <kwd>Multimodal Deep Learning</kwd>
                                                    <kwd>Association Rule Mining</kwd>
                                                    <kwd>Named Entity Recognition</kwd>
                                                    <kwd>Natural Language Processing</kwd>
                                            </kwd-group>
                            
                                                                                                                                                    </article-meta>
    </front>
    <back>
                            <ref-list>
                                    <ref id="ref1">
                        <label>1</label>
                        <mixed-citation publication-type="journal">B. Karakaya, E.B. Boztepe, and B. Karasulu, &quot;Development of a Deep Learning Based Model for Recognizing the Environmental Sounds in Videos,&quot; in The SETSCI Conference Proceedings Book, vol. 5, no. 1, pp. 53-58, 2022.</mixed-citation>
                    </ref>
                                    <ref id="ref2">
                        <label>2</label>
                        <mixed-citation publication-type="journal">B. Karasulu, “Çoklu Ortam Sistemleri İçin Siber Güvenlik Kapsamında Derin Öğrenme Kullanarak Ses Sahne ve Olaylarının Tespiti,” Acta Infologica, vol. 3, no. 2, pp. 60-82, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref3">
                        <label>3</label>
                        <mixed-citation publication-type="journal">E. A. Kıvrak, B. Karasulu, C. Sözbir ve A. Türkay, “Ses Özniteliklerini Kullanan Ses Duygu Durum Sınıflandırma İçin Derin Öğrenme Tabanlı Bir Yazılımsal Araç,” Veri Bilim Dergisi, vol. 4, no. 3, pp.14-27, 2021.</mixed-citation>
                    </ref>
                                    <ref id="ref4">
                        <label>4</label>
                        <mixed-citation publication-type="journal">S. Albawi, T. A. Mohammed, and S. Al-Zawi, “Understanding of a Convolutional Neural Network,” in Proceedings of the International Conference on Engineering and Technology (ICET), Antalya, Turkey, pp. 1-6, 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref5">
                        <label>5</label>
                        <mixed-citation publication-type="journal">Y. Zhao, X. Jin, and X. Hu, “Recurrent Convolutional Neural Network for Speech Processing,” in Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5300-5304, 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref6">
                        <label>6</label>
                        <mixed-citation publication-type="journal">J. Ngiam, A. Khosla, M. Kim, J. Nam, H. Lee, and A. Y. Ng, “Multimodal Deep Learning,” in Proceedings of the 28th International Conference on Machine Learning (ICML11), Bellevue, Washington, USA, pp. 689–696, 2011.</mixed-citation>
                    </ref>
                                    <ref id="ref7">
                        <label>7</label>
                        <mixed-citation publication-type="journal">S. Bird, E. Loper, and J. Baldridge, &quot;Multidisciplinary Instruction with the Natural Language Toolkit,&quot; in Proceedings of the Third Workshop on Issues in Teaching Computational Linguistics, Columbus, Ohio, pp. 62–70, 2008.</mixed-citation>
                    </ref>
                                    <ref id="ref8">
                        <label>8</label>
                        <mixed-citation publication-type="journal">J. Joseph, and J. R. Jeba, &quot;Information Extraction Using Tokenization And Clustering Methods,&quot; International Journal of Recent Technology and Engineering, vol. 8 no. 4, pp. 3680-3692, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref9">
                        <label>9</label>
                        <mixed-citation publication-type="journal">H. van Halteren, J. Zavrel, and W. Daelemans, “Improving Accuracy in NLP Through Combination of Machine Learning Systems,” Computational Linguistics. vol. 27, no. 2, pp. 199–229, 2001.</mixed-citation>
                    </ref>
                                    <ref id="ref10">
                        <label>10</label>
                        <mixed-citation publication-type="journal">A. Roy, “Recent Trends in Named Entity Recognition (NER),” arXiv preprint arXiv:2101.11420 [cs.CL], 2021.</mixed-citation>
                    </ref>
                                    <ref id="ref11">
                        <label>11</label>
                        <mixed-citation publication-type="journal">K. Shaukat, S. Zaheer, and I. Nawaz, “Association Rule Mining: An Application Perspective,” International Journal of Computer Science and Innovation, vol. 2015, no. 1, pp.29-38, 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref12">
                        <label>12</label>
                        <mixed-citation publication-type="journal">VOSK Offline Speech Recognition Library Website, 2022, [Online]. Available: https://alphacephei.com/vosk/. [Accessed: 01-July-2022]</mixed-citation>
                    </ref>
                                    <ref id="ref13">
                        <label>13</label>
                        <mixed-citation publication-type="journal">Ö. Şahinaslan, H. Dalyan ve E. Şahinaslan, &quot;Naive Bayes Sınıflandırıcısı Kullanılarak YouTube Verileri Üzerinden Çok Dilli Duygu Analizi,&quot; Bilişim Teknolojileri Dergisi, vol. 15, no. 2, pp. 221-229, 2022.</mixed-citation>
                    </ref>
                                    <ref id="ref14">
                        <label>14</label>
                        <mixed-citation publication-type="journal">M.C. Yılmaz ve Z. Orman, &quot;LSTM Derin Öğrenme Yaklaşımı ile Covid-19 Pandemi Sürecinde Twitter Verilerinden Duygu Analizi,&quot; Acta Infologica, vol. 5, no. 2, pp. 359-372. 2021.</mixed-citation>
                    </ref>
                                    <ref id="ref15">
                        <label>15</label>
                        <mixed-citation publication-type="journal">N. Buduma and N. Lacascio, Designing Next-Generation Machine Intelligence Algorithms Fundamentals of Deep Learning, O’Reilly Media UK Ltd., 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref16">
                        <label>16</label>
                        <mixed-citation publication-type="journal">F. Chollet, Deep Learning with Python, Manning Publications, 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref17">
                        <label>17</label>
                        <mixed-citation publication-type="journal">Y. Shen, C.-H. Demarty, and N.Q.K. Duong, “Deep Learning for Multimodal-Based Video Interestingness Prediction,” in Proceedings of the IEEE International Conference on Multimedia and Expo (ICME), pp. 1003-1008, 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref18">
                        <label>18</label>
                        <mixed-citation publication-type="journal">Y.-G. Jiang, Y. Wang, R. Feng, X. Xue, Y. Zheng, and H. Yang, “Understanding and Predicting Interestingness of Videos,” in Proceedings of the Twenty-Seventh AAAI Conference on Artificial Intelligence, pp. 1113–1119, 2013.</mixed-citation>
                    </ref>
                                    <ref id="ref19">
                        <label>19</label>
                        <mixed-citation publication-type="journal">D. M. Agrawal, H. B. Sailor, M. H. Soni, and H. A. Patil, “Novel TEO-based Gammatone Features for Environmental Sound Classification,” in Proceedings of the 25th European Signal Processing Conference, pp.1859-1863, 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref20">
                        <label>20</label>
                        <mixed-citation publication-type="journal">Z. Mushtaq and S.-F. Su, “Efficient Classification of Environmental Sounds through Multiple Features Aggregation and Data Enhancement Techniques for Spectrogram Images,” Symmetry, vol. 12, no. 11:1822, pp. 1-34, 2020.</mixed-citation>
                    </ref>
                                    <ref id="ref21">
                        <label>21</label>
                        <mixed-citation publication-type="journal">DenseNet Documentation, 2022, [Online]. Available: https://github.com/liuzhuang13/DenseNet. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref22">
                        <label>22</label>
                        <mixed-citation publication-type="journal">A. Khamparia, D. Gupta, N.G. Nguyen, A. Khanna, B. Pandey, and P. Tiwari, “Sound Classification Using Convolutional Neural Network and Tensor Deep Stacking Network,” IEEE Access, vol. 7, pp. 7717-7727, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref23">
                        <label>23</label>
                        <mixed-citation publication-type="journal">K.J. Piczak, “Environmental sound classification with convolutional neural networks,” in Proceedings of the IEEE 25th International Workshop on Machine Learning for Signal Processing (MLSP), Boston, MA, USA pp. 1-6. 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref24">
                        <label>24</label>
                        <mixed-citation publication-type="journal">R. A. Khalil, E. Jones, M. I. Babar, T. Jan, M. Haseeb Z., and T. Alhussain, “Speech Emotion Recognition Using Deep Learning Techniques: A Review,” IEEE Access, vol. 7 pp. 117327-117345, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref25">
                        <label>25</label>
                        <mixed-citation publication-type="journal">M. Gygli, H. Grabner, and L. V. Gool, “Video Summarization By Learning Submodular Mixtures Of Objectives,” in Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition (CVPR), Boston, MA, USA, pp. 3090-3098, 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref26">
                        <label>26</label>
                        <mixed-citation publication-type="journal">B. A. Plummer, M. Brown, and S. Lazebnik, “Enhancing Video Summarization Via Vision-Language Embedding,” in Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition (CVPR), Honolulu, HI, USA, pp. 1052-1060, 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref27">
                        <label>27</label>
                        <mixed-citation publication-type="journal">K. Zhang, W.-L. Chao, F. Sha, and K. Grauman, “Summary Transfer: Exemplar-Based Subset Selection For Video Summarization,” in Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition (CVPR), pp. 1059-1067, 2016.</mixed-citation>
                    </ref>
                                    <ref id="ref28">
                        <label>28</label>
                        <mixed-citation publication-type="journal">K. Petros, and M. Petros, “SUSiNet: See, Understand and Summarize It,” in Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), Long Beach, CA, USA 16-17 June, pp. 809-819, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref29">
                        <label>29</label>
                        <mixed-citation publication-type="journal">Python Programming Language and Python Modules Documentation, 2022, [Online]. Available: https://www.python.org/doc/. [Accessed: 01-July-2022]</mixed-citation>
                    </ref>
                                    <ref id="ref30">
                        <label>30</label>
                        <mixed-citation publication-type="journal">Tensorflow Library Documentation, 2022, [Online]. Available: https://www.tensorflow.org/api_docs. [Accessed: 01-July-2022]</mixed-citation>
                    </ref>
                                    <ref id="ref31">
                        <label>31</label>
                        <mixed-citation publication-type="journal">Keras Library Documentation, 2022, [Online]. Available: https://keras.io/api/. [Accessed: 01-July-2022]</mixed-citation>
                    </ref>
                                    <ref id="ref32">
                        <label>32</label>
                        <mixed-citation publication-type="journal">M. Sandler, A. Howard, M. Zhu, A. Zhmoginov, and L.-C. Chen, “MobileNetV2: Inverted Residuals and Linear Bottlenecks,” in proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Salt Lake City, UT, USA, pp. 4510-4520, 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref33">
                        <label>33</label>
                        <mixed-citation publication-type="journal">Python Data Analysis Library (Pandas) Website 2022, [Online]. Available:  https://pandas.pydata.org/. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref34">
                        <label>34</label>
                        <mixed-citation publication-type="journal">Library for Visualization with Python (Matplotlib) Website, 2022, [Online]. Available: https://matplotlib.org/. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref35">
                        <label>35</label>
                        <mixed-citation publication-type="journal">Python Statistical Data Visualization Library (Seaborn) Website, 2022, [Online]. Available: https://seaborn.pydata.org/introduction.html.  [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref36">
                        <label>36</label>
                        <mixed-citation publication-type="journal">Numerical Library for Python (NumPy), 2022, [Online]. Available: https://numpy.org/. [Accessed: 01-July-2022]</mixed-citation>
                    </ref>
                                    <ref id="ref37">
                        <label>37</label>
                        <mixed-citation publication-type="journal">SpaCy Natural Language Processing Library for Python, 2022, [Online]. Available: https://spacy.io/api/doc. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref38">
                        <label>38</label>
                        <mixed-citation publication-type="journal">Manipulate Audio Library (PyDub) Website, 2022, [Online]. Available: https://pydub.com/. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref39">
                        <label>39</label>
                        <mixed-citation publication-type="journal">OpenCV Library Documentation, 2022, [Online]. Available: https://docs.opencv.org/4.6.0/. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref40">
                        <label>40</label>
                        <mixed-citation publication-type="journal">Moviepy Library Documentation, 2022, [Online]. Available: https://zulko.github.io/moviepy/. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref41">
                        <label>41</label>
                        <mixed-citation publication-type="journal">B. McFee, C. Raffel, D. Liang, D. Ellis, M. Mcvicar, E. Battenberg, and O. Nieto, “Librosa: Audio and Music Signal Analysis in Python,” in Proceedings of the Python in Science Conference, 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref42">
                        <label>42</label>
                        <mixed-citation publication-type="journal">E. Loper and S. Bird, “NLTK: the Natural Language Toolkit,” in Proceedings of the ACL-02 Workshop on Effective tools and methodologies for teaching natural language processing and computational linguistics, vol. 1, pp. 63-70, 2002.</mixed-citation>
                    </ref>
                                    <ref id="ref43">
                        <label>43</label>
                        <mixed-citation publication-type="journal">Transformers Library Documentation, 2022, [Online]. Available: https://huggingface.co/docs/transformers/main/en/index. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref44">
                        <label>44</label>
                        <mixed-citation publication-type="journal">Difflib module computing deltas for Python, 2022, [Online]. Available: https://docs.python.org/3/library/difflib.html. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref45">
                        <label>45</label>
                        <mixed-citation publication-type="journal">Zeyrek: Morphological Analyzer and Lemmatizer GitHub Website, 2022, [Online], Available: https://github.com/obulat/zeyrek. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref46">
                        <label>46</label>
                        <mixed-citation publication-type="journal">Library for approximate and phonetic matching of strings for Python, 2022, [Online]. Available: https://github.com/jamesturk/jellyfish. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref47">
                        <label>47</label>
                        <mixed-citation publication-type="journal">Gradio Library Documentation, 2022, [Online]. Available: https://gradio.app/docs/. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref48">
                        <label>48</label>
                        <mixed-citation publication-type="journal">K. J. Piczak, “ESC: Dataset for Environmental Sound Classification,” in Proceedings of the 23rd ACM international conference on Multimedia, pp. 1015–1018, 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref49">
                        <label>49</label>
                        <mixed-citation publication-type="journal">V. Panayotov, G. Chen, D. Povey and S. Khudanpur, “Librispeech: An ASR corpus based on public domain audio books,” in Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), South Brisbane, QLD, Australia, pp. 5206 - 5210, 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref50">
                        <label>50</label>
                        <mixed-citation publication-type="journal">T. M. Hospedales, S. Gong and T. Xiang, &quot;Learning Tags from Unsegmented Videos of Multiple Human Actions,&quot; in Proceedings of the IEEE 11th International Conference on Data Mining, Vancouver, BC, Canada, pp. 251-259, 2011.</mixed-citation>
                    </ref>
                                    <ref id="ref51">
                        <label>51</label>
                        <mixed-citation publication-type="journal">Youtube. 2022. [Online]. Available: https://www.youtube.com. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref52">
                        <label>52</label>
                        <mixed-citation publication-type="journal">R. Kolobov et al., “MediaSpeech: Multilanguage ASR Benchmark and Dataset,” arXiv preprint  arXiv:2103.16193, 2021.</mixed-citation>
                    </ref>
                                    <ref id="ref53">
                        <label>53</label>
                        <mixed-citation publication-type="journal">M. Rochan, L. Ye, and Y. Wang, “Video Summarization Using Fully Convolutional Sequence Networks,” in Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds) Computer Vision – ECCV 2018. ECCV 2018. Lecture Notes in Computer Science, vol. 11216. pp 358–374, 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref54">
                        <label>54</label>
                        <mixed-citation publication-type="journal">S. Jadon and M. Jasim, &quot;Unsupervised video summarization framework using keyframe extraction and video skimming,&quot; in Proceedings of the IEEE 5th International Conference on Computing Communication and Automation (ICCCA), Greater Noida, UP, India, Oct 30-31, pp. 140-145, 2020.</mixed-citation>
                    </ref>
                                    <ref id="ref55">
                        <label>55</label>
                        <mixed-citation publication-type="journal">J. Park, J. Lee, S. Jeon, and K. Sohn, &quot;Video Summarization by Learning Relationships between Action and Scene,&quot; in Proceedings of the IEEE/CVF International Conference on Computer Vision Workshop (ICCVW), Seoul, Korea (South), 27-28 October, pp. 1545-1552, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref56">
                        <label>56</label>
                        <mixed-citation publication-type="journal">Z. Li, G. M. Schuster, A. K. Katsaggelos, and B. Gandhi, &quot;Rate-distortion optimal video summarization: a dynamic programming solution,&quot; in Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, Montreal, QC, Canada, vol. 3, pp. iii-457, 2004.</mixed-citation>
                    </ref>
                                    <ref id="ref57">
                        <label>57</label>
                        <mixed-citation publication-type="journal">S. Lu, M. R. Lyu, and I. King, &quot;Video summarization by spatial-temporal graph optimization,&quot; in Proceedings of the 2004 IEEE International Symposium on Circuits and Systems (ISCAS), Vancouver, BC, Canada, pp. II-197, 2004.</mixed-citation>
                    </ref>
                                    <ref id="ref58">
                        <label>58</label>
                        <mixed-citation publication-type="journal">D. Potapov, M. Douze, Z. Harchaoui, and C. Schmid, &quot;Category-specific video summarization&quot;, in Proceedings of the European Conference on Computer Vision (ECCV), Zurich, Switzerland, 6-12 September, pp. 540–555, 2014.</mixed-citation>
                    </ref>
                                    <ref id="ref59">
                        <label>59</label>
                        <mixed-citation publication-type="journal">M. Otani, Y. Nakashima, E. Rahtu, and J. Heikkila, &quot;Rethinking the Evaluation of Video Summaries,&quot; in Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Long Beach, CA, USA, pp. 7588-7596, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref60">
                        <label>60</label>
                        <mixed-citation publication-type="journal">K. Simonyan and A. Zisserman, “Very Deep Convolutional Networks for Large-Scale Image Recognition”, arXiv preprint arXiv:1409.1556v6 [cs.CV], 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref61">
                        <label>61</label>
                        <mixed-citation publication-type="journal">K. Zhou, Y. Qiao and T. Xiang, “Deep Reinforcement Learning for Unsupervised Video Summarization with Diversity-Representativeness Reward,” arXiv preprint arXiv:1801.00054, 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref62">
                        <label>62</label>
                        <mixed-citation publication-type="journal">Kernel Temporal Segmentation (KTS). 2022. [Online]. Available:  https://github.com/TatsuyaShirakawa/KTS.  [Accessed: 01-July-2022]</mixed-citation>
                    </ref>
                                    <ref id="ref63">
                        <label>63</label>
                        <mixed-citation publication-type="journal">Y. Song, J. Vallmitjana, A. Stent, and A. Jaimes, &quot;TVSum: Summarizing web videos using titles,&quot; in Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Boston, MA, USA, 07-12 June, pp. 5179-5187, 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref64">
                        <label>64</label>
                        <mixed-citation publication-type="journal">R. Andonov, V. Poirriez, and S. Rajopadhye, &quot;Unbounded knapsack problem: Dynamic programming revisited,&quot; European Journal of Operational Research, vol. 123, no. 2, pp. 394-407, 2000.</mixed-citation>
                    </ref>
                                    <ref id="ref65">
                        <label>65</label>
                        <mixed-citation publication-type="journal">M. Gygli, H. Grabner, H. Riemenschneider, and L. van Goo, “Creating Summaries From User Videos,” in Proceedings of the European Conference on Computer Vision (ECCV), Zurich, Switzerland, 6-12 September, pp. 505–520, 2014.</mixed-citation>
                    </ref>
                                    <ref id="ref66">
                        <label>66</label>
                        <mixed-citation publication-type="journal">P. Musa, F. Rafi, and M. Lamsani, “A Review: Contrast-Limited Adaptive Histogram Equalization (CLAHE) Methods to Help the Application of Face Recognition,” in Proceedings of the Third International Conference on Informatics and Computing (ICIC), Palembang, Indonesia, 17-18 October, pp. 1-6, 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref67">
                        <label>67</label>
                        <mixed-citation publication-type="journal">Z. Zhang, S. Xu, S. Zhang, T. Qiao, and S. Cao, “Learning Attentive Representations for Environmental Sound Classification,” IEEE Access, vol. 7, pp. 130327 - 130339, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref68">
                        <label>68</label>
                        <mixed-citation publication-type="journal">Ö. Eski·dere ve F. Ertaş, “Mel Frekansı Kepstrum Katsayilarındaki· Deği·şi·mleri·n Konuşmacı Tanımaya Etki·si·,” Uludağ Üniversitesi Mühendislik-Mimarlık Fakültesi Dergisi, vol. 14, no. 2, pp. 93-110, 2009.</mixed-citation>
                    </ref>
                                    <ref id="ref69">
                        <label>69</label>
                        <mixed-citation publication-type="journal">Md. A. Hossan, S. Memon, and M. A. Gregory, “A Novel Approach for MFCC Feature Extraction,” in Proceedings of the 4th International Conference on Signal Processing and Communication Systems (ICSPCS), Gold Coast, QLD, Australia, 13-15 December, pp. 1-5, 2010.</mixed-citation>
                    </ref>
                                    <ref id="ref70">
                        <label>70</label>
                        <mixed-citation publication-type="journal">N. Jiang, P.Grosche, V. Konz, and M. Müller, &quot;Analyzing chroma feature types for automated chord recognition&quot;, in Proceedings of the 42nd AES International Conference on Semantic Audio. Ilmenau, Germany, pp. 285-294, 22-24 July, 2011.</mixed-citation>
                    </ref>
                                    <ref id="ref71">
                        <label>71</label>
                        <mixed-citation publication-type="journal">Rotating Images Information Website, 2022, [Online]. Available: https://datagenetics.com/blog/august32013/index.html. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref72">
                        <label>72</label>
                        <mixed-citation publication-type="journal">Y. Bengio, A. Courville, and Pa. Vincent, “Representation Learning: A Review and NewPerspectives,” IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 34, no. 8, pp. 1798-1828, 2013.</mixed-citation>
                    </ref>
                                    <ref id="ref73">
                        <label>73</label>
                        <mixed-citation publication-type="journal">J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei-Fei, “ImageNet: A Large-Scale Hierarchical Image Database,” in Proceedings of 2009 IEEE Conference on Computer Vision and Pattern Recognition, Miami, FL, USA, pp. 248-255, 20-25 June, 2009.</mixed-citation>
                    </ref>
                                    <ref id="ref74">
                        <label>74</label>
                        <mixed-citation publication-type="journal">D. P. Kingma, and J. Ba, “Adam: A Method for Stochastic Optimization,” in Proceedings of the 3rd International Conference for Learning Representations, San Diego, USA, pp. 1-13, 2015.</mixed-citation>
                    </ref>
                                    <ref id="ref75">
                        <label>75</label>
                        <mixed-citation publication-type="journal">S. Albelwi and A. Mahmood, “A framework for designing the architectures of deep convolutional neural networks,” Entropy, vol. 19, no. 6:242, 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref76">
                        <label>76</label>
                        <mixed-citation publication-type="journal">M. Folk, G. Heber, Q. Koziol, E. Pourmal, and D. Robinson, “An Overview of the HDF5 Technology Suite and its Applications,” in Proceedings of the EDBT/ICDT 2011 Workshop on Array Databases, Uppsala, Sweden, March 25, pp. 36-47, 2011.</mixed-citation>
                    </ref>
                                    <ref id="ref77">
                        <label>77</label>
                        <mixed-citation publication-type="journal">M. Mednis and M. K. Aurich, “Application of String Similarity Ratio and Edit Distance in Automatic Metabolite Reconciliation Comparing Reconstructions and Models,” Biosystems and Information Technology, vol.1, no.1, pp. 14-18, 2012.</mixed-citation>
                    </ref>
                                    <ref id="ref78">
                        <label>78</label>
                        <mixed-citation publication-type="journal">K. Dreßler and A.-C. Ngonga Ngomo, “On the Ef?cient Execution of Bounded Jaro-Winkler Distances,” Semantic Web, Issue title: Ontology and linked data matching, vol. 8, no 2, pp 185–196, 2017.</mixed-citation>
                    </ref>
                                    <ref id="ref79">
                        <label>79</label>
                        <mixed-citation publication-type="journal">K. Papineni, S. Roukos, T. Ward, and W.-J. Zhu, “BLEU: a Method for Automatic Evaluation of Machine Translation,” in Proceedings of the 40th Annual Meeting on Association for Computational Linguistics, Philadelphia Pennsylvania, USA, 7 - 12 July, pp. 311–318, 2002.</mixed-citation>
                    </ref>
                                    <ref id="ref80">
                        <label>80</label>
                        <mixed-citation publication-type="journal">C. Callison-Burch, M. Osborne, and P. Koehn, “Re-evaluating the Role of BLEU in Machine Translation Research,” in Proceedings of the 11th Conference of the European Chapter of the Association for Computational Linguistics (EACL), Trento, Italy, 3-7 April, pp. 249-256, 2006.</mixed-citation>
                    </ref>
                                    <ref id="ref81">
                        <label>81</label>
                        <mixed-citation publication-type="journal">F. Rahutomo, T. Kitasuka, and M. Aritsugi, “Semantic Cosine Similarity,” in Proceedings of the 7th International Student Conference on Advanced Science and Technology ICAST, Seoul, South Korea, 2012.</mixed-citation>
                    </ref>
                                    <ref id="ref82">
                        <label>82</label>
                        <mixed-citation publication-type="journal">J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova, &quot;BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding,&quot; Computation and Language (cs.CL), arXiv preprint arXiv:1810.04805 [cs.CL], 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref83">
                        <label>83</label>
                        <mixed-citation publication-type="journal">Hugging Face Services Documentation, 2022, [Online]. Available: https://huggingface.co/docs. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref84">
                        <label>84</label>
                        <mixed-citation publication-type="journal">Roberta Sentiment Model Documentation, 2022, [Online]. Available: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref85">
                        <label>85</label>
                        <mixed-citation publication-type="journal">BERT-Turkish Sentiment Model Documentation, 2022, [Online]. Available: https://huggingface.co/savasy/bert-base-turkish-sentiment-cased. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref86">
                        <label>86</label>
                        <mixed-citation publication-type="journal">S. Yildirim, “Comparing Deep Neural Networks to Traditional Models for Sentiment Analysis in Turkish Language,” In: B. Agarwal, R. Nayak, N. Mittal, and S. Patnaik, (eds) Deep Learning-Based Approaches for Sentiment Analysis. Algorithms for Intelligent Systems. Springer, Singapore, pp. 311-319, 2020.</mixed-citation>
                    </ref>
                                    <ref id="ref87">
                        <label>87</label>
                        <mixed-citation publication-type="journal">S. Sarica and J. Luo, “Stopwords in Technical Language Processing,” Plos One, vol.16, no.8, pp. 1-13, 2021.</mixed-citation>
                    </ref>
                                    <ref id="ref88">
                        <label>88</label>
                        <mixed-citation publication-type="journal">S. Panjaitan, Sulindawaty, M. Amin, S. Lindawati, R. Watrianthos, H. T. Sihotang, and B. Sinaga, “Implementation of Apriori Algorithm for Analysis of Consumer Purchase Patterns,” in Proceedings of the International Conference on Computer Science and Applied Mathematic, IOP Conf. Series: Journal of Physics: Conf. Series, vol. 1255, no. 1, pp. 1-8, 2019.</mixed-citation>
                    </ref>
                                    <ref id="ref89">
                        <label>89</label>
                        <mixed-citation publication-type="journal">AVESA GitHub Repository, 2022, [Online]. Available: https://github.com/berayboztepe/AVESA. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref90">
                        <label>90</label>
                        <mixed-citation publication-type="journal">Pexels Website, 2022, [Online]. Available: https://www.pexels.com. [Accessed: 01-July-2022].</mixed-citation>
                    </ref>
                                    <ref id="ref91">
                        <label>91</label>
                        <mixed-citation publication-type="journal">B. Karasulu, “Kısıtlanmış Boltzmann makinesi ve farklı sınıflandırıcılarla oluşturulan sınıflandırma iş hatlarının başarımının değerlendirilmesi”, Bilişim Teknolojileri Dergisi, vol. 11, no. 3, pp. 223-233, 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref92">
                        <label>92</label>
                        <mixed-citation publication-type="journal">A. Ali and S. Renals, “Word Error Rate Estimation for Speech Recognition: e-WER,” in Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, Melbourne, Australia, 15 - 20 July, pp. 20-24, 2018.</mixed-citation>
                    </ref>
                                    <ref id="ref93">
                        <label>93</label>
                        <mixed-citation publication-type="journal">T. Fawcett, “Introduction to ROC analysis,” Pattern Recognition Letters, vol. 27, no. 8, pp. 861-874, 2006.</mixed-citation>
                    </ref>
                                    <ref id="ref94">
                        <label>94</label>
                        <mixed-citation publication-type="journal">D. M. W. Powers, “The Problem of Area Under the Curve,” in Proceedings of the IEEE International Conference on Information Science and Technology (ICIST), Wuhan, China, 23-25 March, pp. 567-573, 2012.</mixed-citation>
                    </ref>
                            </ref-list>
                    </back>
    </article>
