@comment{Journal article entry. One field per line; the page range uses the
  ASCII double hyphen and percent signs in the abstract are escaped so the
  field is safe if a style typesets it.}
@article{article_844862,
  author    = {Nasiboglu, Resmiye and Akdoğan, Adem},
  title     = {Comparison of Different Classification Algorithms for Extraction Information from Invoice Images Using an N-Gram Approach},
  journal   = {Avrupa Bilim ve Teknoloji Dergisi},
  year      = {2021},
  number    = {31},
  pages     = {991--1003},
  publisher = {Osman SAĞDIÇ},
  doi       = {10.31590/ejosat.844862},
  keywords  = {Machine learning, Information extraction, N-gram, Levenshtein distance, Jaro-Winkler distance},
  abstract  = {Artificial intelligence (AI) has started to be used in many areas today.
    One of these areas is the accounting sector.
    Accounting companies may sometimes be inadequate especially in the face of intense invoicing transactions of large companies.
    This problem raised the need to process invoices by an Artificial Intelligence powered system.
    The goal of this work is to determine the best machine learning model to extract information such as invoice number, invoice date, due date, delivery date, total gross, total net, vat amount and IBAN from the invoice image files.
    Information obtained by the Tesseract Optical Character Recognition (OCR) system has been converted into n-gram format.
    A number of attributes of the n-gram are calculated such as the coordinates, the length, the width, the line number, the template information of n-grams, the Levenshtein and the Jaro-Winkler distances between the candidate n-grams and the keywords in the control keywords list.
    The use of the Levenshtein distance between candidate n-grams and the control keywords has resulted in a sufficiently high predictive rate.
    The most appropriate model and features are determined for the training.
    Algorithms such as Random Forest, Gradient Boosting Machine, Extreme Gradient Boosting, K-Nearest Neighbors, AdaBoost and Decision Tree were compared as prediction models.
    A total of 9910 invoices were used by splitting 80\% for training and 20\% for testing.
    It was observed that the Random Forest model using the Levenshtein distance is the best model with an average F1 score of 0.9137.},
}