@article{article_1555903, title={Spam Mail Detection in Turkish and English Languages: A Holistic Study of AI-based Techniques including Individual, Ensemble and Hybrid Approaches}, journal={Necmettin Erbakan Üniversitesi Fen ve Mühendislik Bilimleri Dergisi}, volume={7}, pages={189--205}, year={2025}, author={Candan, Esma Nisa and Küçükilhan, Rehnüma and Eroğlu, Alperen}, keywords={English Datasets, Ensemble Learning, Hybrid Learning, Turkish Datasets, Spam Mail}, abstract={Spam has surged due to increased email and social media use, posing a critical challenge in effectively detecting and classifying this growing volume without causing harm to systems. This paper presents a holistic strategy to analyze and reveal the most efficient approaches for detecting and classifying e-mails as spam or ham by using Turkish and English datasets. We use two different datasets generated in different languages in addition to conjunctively generated new datasets. We make a comparative study to find out the best spam mail detection approaches based on our enhanced machine learning and deep learning methods. We also bring ensemble and hybrid learning methods together as a new approach for spam mail detection. We utilize natural language processing, and improved learning algorithms with optimized feature selection approaches and preprocessing. We compare various methods commonly used in the literature which are Multinomial Naive Bayes, Support Vector Machine, Logistic Regression, K-Nearest Neighbors, Decision Tree, Random Forest, Voting classifier, and Stacking classifier as machine learning algorithms, and Long Short Term Memory, Bidirectional Long Short Term Memory, Bidirectional Encoder Representations from Transformers as deep learning algorithms. We split the datasets as train data and test data with the 80:20 ratios in addition to 5-fold cross-validation for each model. We also optimize the hyperparameters of our models by using Grid Search. 
The ensemble method based on machine learning approaches provides the best performances which are the percentage of 99.9% for the English Enron dataset, and the hybrid ensemble approach based on simple average yields the best accuracy value of 98.43% for the Turkish dataset from UCI and Kaggle.}, number={2}, publisher={Necmettin Erbakan University}