Intelligent Document Processing
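
A small C++ program that reads a question from the user, strips stop words, scores the remaining query words with a tf-idf-style weighting over the contents of myfile.txt, and replies with the line of the file that best matches the strongest keyword.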

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <algorithm>
#include <numeric>
#include <cctype>
#include <cmath>
using namespace std;

map<string, double> wordQueries;  // query word -> normalized term frequency
vector<string> ret;               // tokens produced by the last call to split()
vector<double> lengthOfLine;      // length (in characters) of each line read

// Tokenize s on whitespace into the global `ret` vector.
vector<string> split(const string &s)
{
    typedef string::size_type string_size;
    string_size i = 0;

    ret.clear();  // previous calls must not leak tokens into this one
    while (i != s.size())
    {
        // skip leading whitespace
        while (i != s.size() && isspace(static_cast<unsigned char>(s[i])))
            ++i;
        string_size j = i;

        // scan to the end of the current token
        while (j != s.size() && !isspace(static_cast<unsigned char>(s[j])))
            ++j;
        if (i != j)
        {
            ret.push_back(s.substr(i, j - i));
            i = j;
        }
    }

    return ret;
}
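// For example, split("the quick  fox") should leave {"the", "quick", "fox"} in ret.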

// Scan fileName for lines containing `token`, score each with a
// tf-idf-style weight, and return the highest-scoring line.
string Keyword(string token, string fileName, double tf)
{
    map<string, double> lineSelection;
    std::ifstream ifs(fileName);
    string line;
    while (getline(ifs, line))
    {
        if (line.find(token) != string::npos)
        {
            split(line);
            int N = ret.size();  // number of words on this line

            double tfnorm = wordQueries.find(token)->second;
            /*
            N  = total number of words in the collection
            df = document frequency of word w
            idf = log[(N - df + 0.5)/(df + 0.5)]
            Here the caller's tf value stands in for df, and the
            line's word count stands in for N.
            */
            double varidf = log((N - tf + 0.5) / (tf + 0.5));

            /*
            Combining TF and IDF: multiply the idf and tf normalizations.
            The idf normalization is global, depending on the distribution
            of individual words in the collection. The tf normalization is
            local; its purpose is to dampen the effect of words that appear
            too many times.
            */
            double normalization = tfnorm * varidf;

            lineSelection.insert(std::pair<string, double>(line, normalization));
        }
    }

    if (lineSelection.empty())
        return "No line containing \"" + token + "\" was found.";

    // Pick the line with the highest combined score.
    std::map<string, double>::iterator best = std::max_element(
        lineSelection.begin(), lineSelection.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    return best->first;
}
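// Worked idf example with illustrative numbers: a 20-word matching line and
// tf = 0.5 give idf = log((20 - 0.5 + 0.5)/(0.5 + 0.5)) = log(20) ≈ 3.0.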

// Return src with every word that appears in `stops` removed.
std::string remove_stop_words(const std::string &src, const std::unordered_set<std::string> &stops)
{
    std::string retval;
    std::istringstream strm(src);
    std::string word;
    while (strm >> word)
    {
        if (!stops.count(word))
            retval += word + " ";
    }

    if (!retval.empty())
        retval.pop_back();
    return retval;
}
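
// A quick sanity check (using the English stop list defined in main below):
// remove_stop_words("what is the tallest mountain", stops)
// should return "tallest mountain".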

int main()
{
    string insertit;
    double tf = 0.0;  // term-frequency score of the best query word
    std::string fileName = "myfile.txt";
    std::ifstream inFile(fileName);
    if (!inFile)
    {
        cerr << "Could not open " << fileName << endl;
        return 1;
    }

    int wordCount = 0;
    int lineNum = 0;

    std::unordered_map<std::string, int> words;

    std::string str;
    cout << "Please enter your question" << endl;
    getline(cin, insertit);

    std::unordered_set<std::string > stops = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because",
        "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't",
        "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
        "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
        "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
        "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
        "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while",
        "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" };
    // Lower-case the query first so its words match both the lower-case
    // stop list and the lower-cased document words indexed below.
    std::transform(insertit.begin(), insertit.end(), insertit.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    insertit = remove_stop_words(insertit, stops);

    /* French stop word list:

        std::unordered_set<std::string > stops = {"alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon", "car", "ce", "cela", "ces", "ceux", "chaque", "ci",
    "comme", "comment", "dans", "des", "du", "dedans", "dehors", "depuis", "devrait", "doit", "donc", "dos", "début", "elle", "elles", "en", "encore", "essai", "est", "et", "eu",
    "fait", "faites", "fois", "font", "hors", "ici", "il", "ils", "je", "juste", "la", "le", "les", "leur", "là", "ma", "maintenant", "mais", "mes", "mien", "moins", "mon", "mot",
    "même", "ni", "nommés", "notre", "nous", "ou", "où", "par", "parce", "pas", "peut", "peu", "plupart", "pour", "pourquoi", "quand", "que", "quel", "quelle", "quelles", "quels",
    "qui", "sa", "sans", "ses", "seulement", "si", "sien", "son",  "sont", "sous", "soyez", "sujet", "sur", "ta", "tandis", "tellement", "tels", "tes", "ton", "tous", "tout", "trop",
    "très", "tu", "voient", "vont", "votre", "vous", "vu", "ça", "étaient", "état", "étions", "été", "être" };

    */

    /*
    In computing, stop words are words that are filtered out before or after
    natural language data (text) is processed. While "stop words" typically
    refers to the most common words in a language, there is no single
    universal list that all natural language processing tools use.
    */
    split(insertit);
    int M = ret.size();  // number of query words left after stop-word removal

    while (getline(inFile, str))
    {
        std::stringstream ss(str);
        std::string s;
        while (ss >> s)
        {
            // normalize: lower-case, then drop non-alphabetic characters
            std::transform(s.begin(), s.end(), s.begin(),
                           [](unsigned char c) { return std::tolower(c); });
            s.erase(std::remove_if(s.begin(), s.end(),
                                   [](unsigned char c) { return std::isalpha(c) == 0; }),
                    s.end());
            if (!s.empty())
            {
                ++wordCount;
                ++words[s];
            }
        }

        ++lineNum;
        lengthOfLine.push_back(static_cast<double>(str.length()));
    }

    auto n = lengthOfLine.size();
    double avg_doc_len = 0.0;
    if (n != 0)
    {
        avg_doc_len = accumulate(lengthOfLine.begin(), lengthOfLine.end(), 0.0) / n;
    }

    // doclen_correction = 0.75 * doclen / avg_doc_len + 0.25 (guard the empty-file case)
    double doclen_correction = (avg_doc_len > 0.0)
        ? 0.75 * wordCount / avg_doc_len + 0.25
        : 0.25;

    for (auto &pair : words)
    {
        for (int j = 0; j < M; j++)
        {
            if (pair.first == ret[j])
            {
                // floating-point division; integer division would truncate to 0
                tf = static_cast<double>(pair.second) / wordCount;
                double tfnorm = tf / (tf + doclen_correction);
                /*
                tf = number of times a word appears in a document, the term frequency
                doclen = number of total words (including duplicates) in a document,
                         the document length
                avg_doc_len = average of all document lengths in the collection
                doclen_correction = 0.75*doclen/avg_doc_len + 0.25
                tfnorm = tf/(tf + doclen_correction)
                */
                wordQueries.insert(std::pair<string, double>(pair.first, tfnorm));
            }
        }
    }
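    /*
    Worked example with illustrative numbers: a query word appearing 4 times
    in a 200-word file gives tf = 4/200 = 0.02; with doclen_correction = 1.0,
    tfnorm = 0.02 / 1.02 ≈ 0.0196.
    */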

    if (wordQueries.empty())
    {
        cout << "None of the query words were found in the file." << endl;
        return 0;
    }

    // The query word with the highest normalized term frequency becomes the keyword.
    std::map<string, double>::iterator best = std::max_element(wordQueries.begin(), wordQueries.end(),
        [](const std::pair<string, double> &a, const std::pair<string, double> &b)
        {
            return a.second < b.second;
        });
    string token = best->first;
    cout << "Let's talk about \"" << token << "\"..." << endl;
    cout << Keyword(token, fileName, tf) << endl;
    return 0;
}
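
To try it, save the listing as, say, idp.cpp next to a plain-text myfile.txt, build it with a C++11-capable compiler (for example, g++ -std=c++11 idp.cpp -o idp), run it, and type a question; the program echoes the keyword it settled on and then the best-matching line from the file.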
