Distinguish binary and text files in C

This example C program demonstrates a simple way to distinguish binary and text files. To read the file, it reuses the code from the page "An example of mmap for a read-only file". For brevity, the comments have been removed from that part of the code. Please see that page for a commented version of the code which maps a file to memory.

#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* Maximum number of leading bytes to scan for control characters. */

#define MAX 0x1000

/* Decide whether a buffer looks like binary data.

   "c" points to the first of "len" readable bytes.  The buffer is
   reported as binary if it starts with the four-byte ELF magic number
   (0x7F 'E' 'L' 'F'), or if any of its first MAX bytes is a control
   character below 0x20 other than tab, newline, carriage return or
   form feed.  Bytes beyond the first MAX are never examined.

   Returns 1 if binary, 0 if probably not binary.  An empty buffer is
   reported as not binary. */

int
test_binary (const unsigned char * c, size_t len)
{
    size_t i;
    size_t max = MAX;

    /* Make sure not to overshoot the end of the buffer while looking
       for bad bytes. */

    if (len < max) {
	max = len;
    }

    /* Look for the ELF magic number.  All four bytes are compared,
       including the leading 0x7F, so that text which merely has "ELF"
       at offsets 1 to 3 (for example "SELF") is not mistaken for an
       executable. */

    if (len >= 4 && c[0] == 0x7f &&
	c[1] == 'E' && c[2] == 'L' && c[3] == 'F') {
	return 1;
    }

    /* Look for bytes which don't normally occur in ASCII or UTF-8
       text. */

    for (i = 0; i < max; i++) {
	if (c[i] < 0x20 && c[i] != '\n' && c[i] != '\t' && c[i] != '\r' &&
	    c[i] != '\f') {
#if 0 /* Change 0 to 1 to get a printout of the bad byte. */
	    printf ("%d at byte %zu\n", c[i], i);
#endif /* 0 */
	    return 1;
	}
    }
    return 0;
}

/* Bail out of the program with a diagnostic when "test" is true.

   "message" and the arguments after it are a printf-style format and
   its values.  When "test" is non-zero the formatted text and a
   newline are written to stderr and the process exits with
   EXIT_FAILURE.  When "test" is zero the function returns without
   doing anything. */

static void
check (int test, const char * message, ...)
{
    va_list ap;

    if (! test) {
        return;
    }
    va_start (ap, message);
    vfprintf (stderr, message, ap);
    va_end (ap);
    fputs ("\n", stderr);
    exit (EXIT_FAILURE);
}

/* Map the whole of "file_name" into memory, read-only.

   On success, "* contents_ptr" is set to the start of the mapping and
   "* size_ptr" to the file's size in bytes as reported by fstat.  The
   caller releases the mapping with munmap.  A failure of open, fstat
   or mmap is reported through "check", which prints a message and
   exits the program.

   See https://www.lemoda.net/c/mmap-example/index.html for a
   commented version of the original code. */

void
mmap_file (const char * file_name,
	   unsigned char ** contents_ptr, size_t * size_ptr)
{
    int fd;
    struct stat s;
    int status;

    fd = open (file_name, O_RDONLY);
    check (fd < 0, "open %s failed: %s", file_name, strerror (errno));
    status = fstat (fd, & s);
    check (status < 0, "stat %s failed: %s", file_name, strerror (errno));
    * size_ptr = s.st_size;

    /* The flags argument must contain MAP_SHARED or MAP_PRIVATE;
       passing 0 makes mmap fail with EINVAL on systems which enforce
       this.  The mapping is only read, so MAP_PRIVATE is sufficient. */

    * contents_ptr = mmap (NULL, * size_ptr, PROT_READ, MAP_PRIVATE, fd, 0);
    check (* contents_ptr == MAP_FAILED, "mmap %s failed: %s",
           file_name, strerror (errno));

    /* The mapping remains valid after the descriptor is closed, so
       close it here instead of leaking one descriptor per file.  This
       is done after the mmap check so that errno is not disturbed
       before it is reported. */

    (void) close (fd);
}

/* Report on standard output whether "file_name" is binary.

   The file is mapped with mmap_file, classified with test_binary,
   and unmapped again.  One line is printed: "<file_name> is binary."
   or "<file_name> is not binary.".  If unmapping fails, "check"
   prints the reason and exits the program. */

void test_file (const char * file_name)
{
    unsigned char * contents;
    size_t size;
    int status;
    mmap_file (file_name, & contents, & size);
    if (test_binary (contents, size)) {
	printf ("%s is binary.\n", file_name);
    }
    else {
	printf ("%s is not binary.\n", file_name);
    }
    status = munmap (contents, size);
    /* Include the system's reason, as the other failure messages in
       this file do. */
    check (status != 0, "munmap %s failed: %s", file_name,
           strerror (errno));
}

/* Classify files as binary or not binary.

   If file names are given on the command line, each one is tested in
   order.  With no arguments the program runs its demonstration: it
   tests its own executable (argv[0], when available) and the file
   "dbf.c". */

int main (int argc, char ** argv)
{
    int i;

    if (argc > 1) {
        for (i = 1; i < argc; i++) {
            test_file (argv[i]);
        }
        return 0;
    }

    /* argv[0] may be a null pointer when argc is zero, so only use it
       when it is present. */

    if (argc > 0) {
        test_file (argv[0]);
    }
    test_file ("dbf.c");
    return 0;
}

(download)

The output of the example looks like this:

./dbf is binary.
dbf.c is not binary.


Copyright © Ben Bullock 2009-2017. All rights reserved. For comments, questions, and corrections, please email Ben Bullock (benkasminbullock@gmail.com) or use the discussion group at Google Groups. / Privacy / Disclaimer