Read Tables in Chunks

Read tables chunk by chunk using a C++ backend and a simple R interface.


Build Status AppVeyor Build Status R Coverage status







This package allows reading large text tables in chunks, using a fast C++ backend. Text files can be imported as data frames (with optional automatic column-type detection) or as matrices. The program is designed to be simple and user-friendly.

chunkR is based on three basic functions: chunker(), to create a connection to a text file; next_chunk(), to read the next chunk; and get_table(), to retrieve the corresponding data chunk.

These functions can be easily included in loops and other source code by using the return value of next_chunk(), which is TRUE when a new chunk is available and FALSE when the file has been read completely. The get_table() function returns an empty data frame/matrix when next_chunk() is FALSE. See the examples below.

Installation

# Install the development version of chunkR from GitHub (requires devtools).
library("devtools")
install_github(repo = "leandroroser/chunkR")

Examples

# Attach the package so that chunker(), next_chunk(), get_table() and
# get_completed() are available to the examples below.
library(chunkR)

# Write the built-in iris data set to a temporary text file.
# quote = FALSE leaves character fields unquoted.
data(iris)
tmp_path <- file.path(tempdir(), "iris.txt")
write.table(iris, tmp_path, quote = FALSE)

#-----------------------------------------------------------------#
#--- Reading a data frame with automatic column-type detection ---#
#-----------------------------------------------------------------#
# create a 'chunker' object passing the path of the input file.
my_chunker_object <- chunker(tmp_path, chunksize = 30)

# read a chunk
next_chunk(my_chunker_object)

# get the chunk
get_table(my_chunker_object)

# read another chunk
next_chunk(my_chunker_object)

# get the number of lines already read
get_completed(my_chunker_object)
#---- Quoted data --------#
# Rewrite the file with quoted character fields (quote = TRUE) and create the
# chunker with quoted = TRUE to match.
write.table(iris, tmp_path, quote = TRUE)

my_chunker_object <- chunker(tmp_path, quoted = TRUE, chunksize = 30)

next_chunk(my_chunker_object)

get_table(my_chunker_object)
#---- Data without rownames and/or colnames ----#
# Write iris without a header row and without row names.
tmp_path <- file.path(tempdir(), "iris.txt")
write.table(iris, tmp_path, row.names = FALSE, col.names = FALSE)

# write.table() quotes character fields by default, hence quoted = TRUE.
# has_rownames / has_colnames describe the layout of the file written above.
my_chunker_object2 <- chunker(
  tmp_path,
  quoted = TRUE, chunksize = 30,
  has_rownames = FALSE, has_colnames = FALSE
)

next_chunk(my_chunker_object2)

get_table(my_chunker_object2) # automatic generation of rownames and/or colnames
#--- read a csv file ---#
# Write iris as a comma-separated file with unquoted fields.
tmp_path_csv <- file.path(tempdir(), "iris.csv")

write.table(iris, tmp_path_csv, quote = FALSE, sep = ",")

# read the csv indicating the value of the sep parameter
my_chunker_object3 <- chunker(tmp_path_csv, chunksize = 30, sep = ",")
# the file can then be processed in the same way as the previous examples

next_chunk(my_chunker_object3)
get_table(my_chunker_object3)

# remove the temporary file
file.remove(tmp_path_csv)
#--------------------------------------------------------#
#--- Reading a data frame using column types argument ---#
#--------------------------------------------------------#
## Four types can be passed : "character", "numeric" (aka "double"), "integer", "logical"

# Regenerate the input file with row names, column names and unquoted fields.
# The previous example left tmp_path without row/column names and with quoted
# fields, which does not match the chunker settings used below.
write.table(iris, tmp_path, quote = FALSE)

# create a 'chunker' object passing the path of the input file.
# One class is given per data column of iris (four measurements + Species).
my_chunker_object4 <- chunker(
  tmp_path,
  chunksize = 120,
  columns_classes = c("numeric", "numeric", "numeric", "numeric", "character")
)

# read a chunk
next_chunk(my_chunker_object4)

# get the chunk
get_table(my_chunker_object4)

# read another chunk
next_chunk(my_chunker_object4)

# get the number of lines already read
get_completed(my_chunker_object4)
#-------------------------#
#--- Reading a matrix  ---#
#-------------------------#
# Write the input file with row names, column names and unquoted fields so
# this example does not depend on the file state left by earlier examples.
write.table(iris, tmp_path, quote = FALSE)

my_chunker_object5 <- chunker(tmp_path, chunksize = 30, data_format = "matrix")

# read a chunk
next_chunk(my_chunker_object5)

# store the chunk as a character matrix in R
this_data <- get_table(my_chunker_object5)


# The package provides a fast generic C++ function for conversion from
# matrix (any R type) to data frame
this_data_as_df2 <- matrix2df(this_data)

# remove the temporary file
file.remove(tmp_path)
#----------------------------------#
#--- Example with a big table -----#
#----------------------------------#
### Example with a data frame
# create a large data frame, and write it to a temporary directory

tmp_path <- file.path(tempdir(), "big_table.txt")

# Number of rows of the example table; one column per basic type.
n_rows <- 1000000
out <- data.frame(
  numeric_data = runif(n_rows),
  character_data = sample(c("a", "t", "c", "g"), n_rows, replace = TRUE),
  integer_data = sample(n_rows),
  bool_data = sample(c(TRUE, FALSE), n_rows, replace = TRUE)
)


write.table(out, tmp_path, quote = FALSE)

# create a chunker object, reading in chunks of 10000 lines
my_chunker_object6 <- chunker(tmp_path, chunksize = 10000)

next_chunk(my_chunker_object6)
data <- get_table(my_chunker_object6)

# check the storage type of each column of the chunk
lapply(data, typeof)

# remove the temporary file
file.remove(tmp_path)
### Example with a matrix
# create a large matrix, and write it to a temporary file

my_table <- tempfile()
# NOTE(review): only 1000000 values are sampled for a 100000 x 1000 matrix, so
# the values are recycled to fill it -- confirm these dimensions are intended.
write.table(
  matrix(sample(c("a", "t", "c", "g"), 1000000, replace = TRUE), 100000, 1000),
  my_table,
  quote = FALSE
)

# create a chunker object, reading in chunks of 10000 lines
my_chunker_object7 <- chunker(my_table, chunksize = 10000, data_format = "matrix")

# create a loop to read the whole file and do something with it;
# next_chunk() returns FALSE once the file has been read completely

lines <- 0
while (next_chunk(my_chunker_object7)) {
  data <- get_table(my_chunker_object7)

  # do something with data, e.g., convert to data frame first
  data <- matrix2df(data)

  # running total of rows processed so far
  lines <- lines + nrow(data)
  cat("Processed ", lines, "lines\n")
}

# remove the temporary file
file.remove(my_table)
 
 

News

chunkR 1.1.1 (Release date: 2018-02-21)

Changes:

  • Added detection of end of line character for cross platform compatibility.
  • Correction of return values in exported functions.

chunkR 1.1.0 (Release date: 2018-01-21)

Changes:

  • The reader class was renamed as chunker for consistency with the package.
  • The reader function was renamed as chunker.
  • New functionality to read data frames directly from the source file. The new interface does not require reading the data as a text matrix before obtaining a data frame.
  • The functions get_matrix and get_dataframe were converted into internal functions and replaced at user level by the function get_table, which can retrieve both data frame and matrix data types.
  • Function get_matrix2dataframe was deprecated.
  • Correction of bugs.

Reference manual

It appears you don't have a PDF plugin for this browser. You can click here to download the reference manual.

install.packages("chunkR")

1.1.1 by Leandro Roser, a year ago


Browse source code at https://github.com/cran/chunkR


Authors: Leandro Roser [aut, cre]


Documentation:   PDF Manual  


GPL (>= 2) license


Imports Rcpp

Depends on methods

Suggests testthat, covr

Linking to Rcpp


See at CRAN