#' Queries the requests made through the Brazilian Right to Information Law
#'
#' Downloads the request files published by the CGU for the selected years,
#' keeps the requests whose text (column `detalhamento`) contains the searched
#' keyword(s) and returns them as a data frame.
#'
#' Files are downloaded and extracted into the session temporary directory.
#' A year whose requests file is already present there is not downloaded again.
#'
#' @importFrom utils download.file unzip
#'
#' @param year selects the years which data will be downloaded. Either `'all'`
#'   (the default) or a vector with one or more years from 2015 to 2022.
#' @param search selects the keyword to be searched. When more than one word
#'   is supplied, Portuguese stopwords are dropped and a request is kept only
#'   when it matches all of the remaining words.
#'
#' @return a dataframe with requests containing the keyword
#' @examples
#' \dontrun{requests(search = 'PAC')}
#' @export
requests <- function(year = 'all', search) {
  old <- Sys.time() # to calculate execution time
  year.options <- 2015:2022
  links <- paste0(
    'https://dadosabertos-download.cgu.gov.br/FalaBR/Arquivos_FalaBR_Filtrado/Arquivos_csv_',
    year.options, '.zip'
  )

  # if the user does not enter the year, data for all years will be downloaded.
  # identical() keeps the test scalar even when `year` is a vector of years.
  if (identical(as.character(year), 'all')) {
    year <- year.options
  }
  invalid <- setdiff(as.character(year), as.character(year.options))
  if (length(invalid) > 0) {
    stop('Invalid year(s): ', paste(invalid, collapse = ', '),
         '. Available years: ', paste(year.options, collapse = ', '), '.',
         call. = FALSE)
  }

  # column names used in non-standard evaluation below (avoids R CMD check notes)
  protocolo <- palavras <- detalhamento <- n <- NULL

  # normalise the search terms: lowercase first, so that capitalised stopwords
  # are also removed, then split multi-word queries into single words
  search <- tolower(search)
  if (sum(stringr::str_count(search, '\\w+')) > 1) {
    search <- unlist(strsplit(search, split = ' '))
    search <- search[!search %in% stopwords::stopwords('portuguese')] # remove the stopwords
  }
  search <- unique(search[nzchar(search)])
  n.termos <- sum(stringr::str_count(search, '\\w+'))

  nomes.colunas <- c('id_pedido','protocolo','esfera','orgao','situacao','data_registro','resumo','detalhamento','prazo',
                     'foi_prorrogado','foi_reencaminhado','forma_resposta','origem_da_solicitacao','id_solicitante',
                     'assunto','sub_assunto','tag','data_resposta','resposta','decisao','especificacao_decisao')
  colunas.usadas <- nomes.colunas[c(2, 4:13, 15, 18:21)]

  # empty data frame with the kept columns; seeds the row-binds so that the
  # result is still a well-formed table when nothing is read or matched
  tabela.vazia <- data.frame(matrix(NA, nrow = 0, ncol = length(colunas.usadas)))
  colnames(tabela.vazia) <- colunas.usadas

  dir.temp <- tempdir()
  tabelas.anuais <- vector('list', length(year))

  # allows to include more than one year at a time with a vector
  for (k in seq_along(year)) {
    i <- year[k]
    padrao.pedidos <- paste0('Pedidos_csv_', i)

    # check if the files of the year have already been downloaded
    lista.arquivos.locais <- list.files(path = dir.temp, pattern = '\\.csv$', full.names = TRUE)

    if (any(grepl(padrao.pedidos, basename(lista.arquivos.locais), fixed = TRUE))) {
      print(paste0('Os arquivos de ', i,' foram baixados anteriormente.'))
    } else {
      # download the archive of the year from the CGU
      link <- links[match(i, year.options)]
      caminho.zip <- file.path(dir.temp, basename(link))
      download.file(link, caminho.zip, mode = 'wb')

      # the file inside the archive is prefixed with the date it was generated,
      # so look its name up in the archive listing instead of guessing the date
      conteudo <- unzip(zipfile = caminho.zip, list = TRUE)$Name
      pedidos <- grep(paste0('_Pedidos_csv_', i, '\\.csv$'), conteudo, value = TRUE)
      if (length(pedidos) == 0) {
        stop('No requests file found in the archive downloaded for ', i, '.', call. = FALSE)
      }
      # extract only the requests file; junkpaths puts it directly in dir.temp
      unzip(zipfile = caminho.zip, files = pedidos, exdir = dir.temp, junkpaths = TRUE)
    }

    # read the files
    lista.arquivos.locais <- list.files(path = dir.temp, pattern = '\\.csv$', full.names = TRUE)
    caminho.arquivo <- lista.arquivos.locais[
      grepl(padrao.pedidos, basename(lista.arquivos.locais), fixed = TRUE)
    ]
    var <- readr::read_csv2(file = caminho.arquivo, col_names = FALSE, quote = '\'', locale = readr::locale(encoding = 'UTF-16LE'))
    colnames(var) <- nomes.colunas
    tabelas.anuais[[k]] <- dplyr::select(var, 2, 4:13, 15, 18:21)
    rm(list = 'var') # remove the variable to free RAM
  }

  # bind all years at once instead of growing the table inside the loop
  tabela <- do.call(rbind, c(list(tabela.vazia), tabelas.anuais))
  rm(list = 'tabelas.anuais')

  # Optimize search to reduce RAM consumption: tokenise and filter the table
  # in blocks of rows, so only one block is expanded into words at a time
  tamanho.bloco <- 10000
  nr <- nrow(tabela)
  lista.tabelas <- split(tabela, rep(seq_len(ceiling(nr / tamanho.bloco)), each = tamanho.bloco, length.out = nr))
  rm(list = 'tabela')

  partes <- lapply(lista.tabelas, function(parte) {
    # one row per (request, word); keep only the rows of searched words
    as.data.frame(parte) %>%
      tidytext::unnest_tokens(palavras, detalhamento, drop = FALSE) %>%
      dplyr::filter(palavras %in% search) %>%
      unique()
  })
  # unname() so the block labels are not used as row-name prefixes by rbind
  tabela.final <- do.call(rbind, c(list(tabela.vazia), unname(partes)))

  # with several search words, keep the requests that matched at least as many
  # distinct (request, word) rows as there are words
  if (n.termos > 1) {
    contagem <- dplyr::count(tabela.final, protocolo)

    tabela.final <- tabela.final %>%
      dplyr::left_join(contagem, by = 'protocolo') %>%
      dplyr::filter(n >= n.termos)
  }

  # drop the helper columns (matched word, count) and the duplicated requests
  tabela.final <- tabela.final %>% dplyr::select(1:16) %>% unique()

  # force seconds: a plain difference would switch to minutes on long runs
  decorrido <- round(as.numeric(difftime(Sys.time(), old, units = 'secs')), 2)
  print(paste0('Consulta finalizada em ', decorrido,' segundos.'))
  print(paste0('Query completed in ', decorrido,' seconds'))
  tabela.final
}
