/*
     This file is part of libextractor.
     (C) 2002, 2003 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 2, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     Boston, MA 02111-1307, USA.

     This code was inspired by pdfinfo and depends heavily
     on the xpdf code that pdfinfo is a part of. See also
     the INFO file in this directory.
 **/


#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <math.h>
#include "parseargs.h"
#include "GString.h"
#include "gmem.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "Params.h"
#include "Error.h"
#include "config.h"

#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#ifndef MINGW
#include <netinet/in.h>
#include <sys/mman.h>
#endif

extern "C" {

#include "extractor.h"

  /**
   * Prepend a keyword to a keyword list.
   *
   * @param type    the type to store in the new list entry
   * @param keyword heap-allocated string (the callers in this file pass
   *                the result of strdup); on success the new entry keeps
   *                this pointer. May be NULL (e.g. when strdup failed),
   *                in which case nothing is added.
   * @param next    current head of the list, may be NULL
   * @return the new head of the list; this is 'next' unchanged if
   *         keyword was NULL or the list entry could not be allocated
   */
  static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type,
                                                char * keyword,
                                                struct EXTRACTOR_Keywords * next) {
    EXTRACTOR_KeywordList * result;

    if (keyword == NULL)
      return next;
    result = (EXTRACTOR_KeywordList*) malloc(sizeof(EXTRACTOR_KeywordList));
    if (result == NULL) {
      /* out of memory: the entry that would have held the keyword was
         never created, so release the string here instead of leaking it
         and leave the list as it was */
      free(keyword);
      return next;
    }
    result->next = next;
    result->keyword = keyword;
    result->keywordType = type;
    return result;
  }
  
  
  /**
   * Look up a string entry in the PDF info dictionary and, if it exists,
   * prepend its value to the keyword list.
   *
   * Values that start with the bytes 0xFE 0xFF are treated as 2-byte
   * characters: as long as every high byte is zero, the low bytes are
   * collected into a plain string. As soon as a non-zero high byte is
   * seen, the whole value is replaced by the placeholder "<unicode>".
   * All other values are copied as they are.
   *
   * @param infoDict the info dictionary to search
   * @param key      name of the dictionary entry
   * @param type     keyword type to use for the new list entry
   * @param next     current head of the keyword list
   * @return the new head of the list, or 'next' if the entry is missing
   *         or not a string
   */
  static struct EXTRACTOR_Keywords * printInfoString(Dict *infoDict,
                                                     char *key,
                                                     EXTRACTOR_KeywordType type,
                                                     struct EXTRACTOR_Keywords * next) {
    Object obj;
    GString *s1, *s2;
    int i;
    int len;

    if (infoDict->lookup(key, &obj)->isString()) {
      s1 = obj.getString();
      len = s1->getLength();
      /* only inspect the two marker bytes if the string actually has
         them; never index at or beyond the reported length */
      if (len >= 2 &&
          (s1->getChar(0) & 0xff) == 0xfe &&
          (s1->getChar(1) & 0xff) == 0xff) {
        s2 = new GString();
        for (i = 2; i < len; i += 2) {
          if (s1->getChar(i) != '\0') {
            /* character does not fit into a single byte */
            delete s2;
            s2 = new GString("<unicode>");
            break;
          }
          /* an odd-length value ends with a lone high byte that has no
             low byte to go with it; do not read past the end for it */
          if (i + 1 < len)
            s2->append(s1->getChar(i+1));
        }
        next = addKeyword(type, strdup(s2->getCString()), next);
        delete s2;
      } else {
        next = addKeyword(type, strdup(s1->getCString()), next);
      }
    }
    obj.free();
    return next;
  }
  
  /**
   * Look up a date entry in the PDF info dictionary and, if it exists
   * as a string, prepend it to the keyword list. A leading "D:" is
   * dropped from the value; the rest is copied without further parsing.
   *
   * @param infoDict the info dictionary to search
   * @param key      name of the dictionary entry
   * @param type     keyword type to use for the new list entry
   * @param next     current head of the keyword list
   * @return the new head of the list, or 'next' if the entry is missing
   *         or not a string
   */
  static struct EXTRACTOR_Keywords * printInfoDate(Dict *infoDict,
                                                   char *key,
                                                   EXTRACTOR_KeywordType type,
                                                   struct EXTRACTOR_Keywords * next) {
    Object entry;

    if (infoDict->lookup(key, &entry)->isString()) {
      char * text = entry.getString()->getCString();

      /* skip the "D:" marker when the value starts with it */
      if (0 == strncmp(text, "D:", 2))
        text = &text[2];
      next = addKeyword(type, strdup(text), next);
    }
    entry.free();
    return next;
  }
  
  struct EXTRACTOR_Keywords * libextractor_pdf_extract(char * filename,
                                                       unsigned char * data,
                                                       size_t size,
                                                       struct EXTRACTOR_Keywords * prev) {
    PDFDoc * doc;
    GString * fileName;
    Object info;
    struct EXTRACTOR_Keywords * result;
    
    fileName = new GString(filename);
    /* errorInit();   -- keep commented out, otherwise errors are printed to stderr for non-pdf files! */
    initParams(".xpdfrc", ".xpdfrc");
    doc = new PDFDoc(fileName, NULL, NULL);
    if (! doc->isOk()) {
      delete doc;
      freeParams();
      return prev;
    }
    
    result = addKeyword(EXTRACTOR_MIMETYPE,
			strdup("application/pdf"),
			prev);
    doc->getDocInfo(&info);
    if (info.isDict()) {
      result = printInfoString(info.getDict(), 
			       "Title", 
			       EXTRACTOR_TITLE,
			       result);
      result = printInfoString(info.getDict(), 
			       "Subject",      
			       EXTRACTOR_SUBJECT,
			       result);
      result = printInfoString(info.getDict(), 
			       "Keywords",
			       EXTRACTOR_KEYWORDS,
			       result);
      result = printInfoString(info.getDict(), 
			       "Author",
			       EXTRACTOR_AUTHOR,
			       result);
      result = printInfoString(info.getDict(), 
			       "Creator",
			       EXTRACTOR_CREATOR,
			       result);
      result = printInfoString(info.getDict(), 
			       "Producer",     
			       EXTRACTOR_PRODUCER,
			       result);
      {
	char pcnt[20];
	sprintf(pcnt, "%d", doc->getNumPages());
	result = addKeyword(EXTRACTOR_PAGE_COUNT,
			    strdup(pcnt),
			    result);
      }
      result = printInfoDate(info.getDict(),   
			     "CreationDate", 
			     EXTRACTOR_CREATION_DATE,
			     result);
      result = printInfoDate(info.getDict(),   
			     "ModDate",
			     EXTRACTOR_MODIFICATION_DATE,
			     result);
    }
    info.free();
    delete doc;
    freeParams();
    
    return result;  
  }
}

#define HAVE_MAIN 0
#if HAVE_MAIN
/**
 * Test driver (only built when HAVE_MAIN is non-zero): maps the file
 * named on the command line into memory, runs the PDF extractor on it
 * and prints the resulting keywords to stdout.
 *
 * @return 0 on success, -1 on a usage error or if the file could not be
 *         opened, examined or mapped
 */
int main (int argc, char **argv) {
  int file;
  void * buffer;
  struct stat fstatbuf;
  size_t size;

  if (argc != 2) {
    fprintf(stderr, 
	    "Call with filename as argument\n");
    return -1;
  }  
  file = OPEN(argv[1],O_RDONLY);
  if (-1 == file) 
    return -1;
  if (-1 == fstat(file, &fstatbuf)) {
    close(file);
    return -1;
  }
  size = fstatbuf.st_size;
  buffer = mmap(NULL, size, PROT_READ, MAP_SHARED, file, 0);  
  /* the descriptor is not needed once the mapping has been attempted */
  close(file);
  /* mmap reports failure with MAP_FAILED, not NULL */
  if (MAP_FAILED == buffer)
    return -1;

  EXTRACTOR_printKeywords(stdout,
			  libextractor_pdf_extract(argv[1], 
						   (unsigned char *) buffer,
						   size,
						   NULL));
  munmap(buffer,size);
  return 0;
}
#endif
