новые иконки в OpenBoard
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
OpenBoard/thirdparty/mera/pdf-merger/Parser.cpp

880 lines
27 KiB

#include <fstream>
#include <iostream>
#include <vector>
#include <map>
#include <stack>
#include <string.h>
#include "Parser.h"
#include "Object.h"
#include "Exception.h"
#include "Utils.h"
using namespace merge_lib;
using namespace std;
// Character classes shared by the tokenizing helpers in this file.
// Characters treated as token-separating white space.
const std::string Parser::WHITESPACES(" \t\f\v\n\r");
// Characters treated as token delimiters.
const std::string Parser::DELIMETERS("()<>{}/%][");
// Decimal digits.
const std::string Parser::NUMBERS("0123456789");
// Concatenation of the white-space and delimiter sets. It is defined after
// both operands in this translation unit, so they are initialised first.
const std::string Parser::WHITESPACES_AND_DELIMETERS = Parser::WHITESPACES + Parser::DELIMETERS;
/**
 * Parses the file named fileName and returns the resulting Document.
 *
 * A new Document is allocated, then the object tree and the page list are
 * built. If either step throws a std::exception, the parser state is cleared,
 * the partially built document is deleted, _document is reset and the
 * exception is rethrown to the caller.
 */
Document * Parser::parseDocument(const char * fileName)
{
   _document = new Document(fileName);
   try
   {
      _createObjectTree(fileName);
      _createDocument(fileName);
      return _document;
   }
   catch(const std::exception &)
   {
      // Roll back everything allocated so far before propagating the error.
      _clearParser();
      delete _document;
      _document = 0;
      throw;
   }
}
/**
 * Recursively walks the page tree rooted at objectWithKids and registers
 * every leaf page in _document->_pages, numbered in visiting order from 1.
 *
 * An object is treated as a leaf page when its content has no "/Kids" entry
 * but contains "/Page". Otherwise the children referenced between "/Kids"
 * and the next "]" are visited in positional order.
 *
 * The search results are kept as std::string::size_type: stored in an
 * unsigned int, a "not found" result no longer equals std::string::npos on
 * platforms where size_t is wider, and leaf pages were never recognised.
 */
void Parser::_retrieveAllPages(Object * objectWithKids)
{
   const std::string & objectContent = objectWithKids->getObjectContent();
   const std::string::size_type startOfKids = objectContent.find("/Kids");
   if((startOfKids == std::string::npos) &&
      (objectContent.find("/Page") != std::string::npos))
   {
      const unsigned int pageNumber = static_cast<unsigned int>(_document->_pages.size() + 1);
      Page * newPage = new Page(pageNumber);
      newPage->_root = objectWithKids;
      _document->_pages.insert(std::pair<unsigned int, Page *>(pageNumber, newPage));
      return;
   }
   const std::string::size_type endOfKids = objectContent.find("]", startOfKids);
   // The casts reproduce the unsigned int values the original code passed on.
   const std::vector<Object *> & kids = objectWithKids->getSortedByPositionChildren(
      static_cast<unsigned int>(startOfKids), static_cast<unsigned int>(endOfKids));
   for(size_t i(0); i < kids.size(); ++i)
   {
      _retrieveAllPages(kids[i]);
   }
}
/**
 * Fills _document from the already built object tree: stores the root object,
 * locates the single object referenced by the root's "/Pages" entry, collects
 * all pages below it, records the maximal object number and finally releases
 * the parser's temporary state.
 *
 * @param docName not used by this function.
 * @throws Exception when the root has no "/Pages" entry or when the entry
 *         does not resolve to exactly one child object.
 *
 * The "/Pages" position is kept as std::string::size_type so the comparison
 * with npos also works where size_t is wider than unsigned int.
 */
void Parser::_createDocument(const char * docName)
{
   _document->_root = _root;
   const std::string & rootContent = _root->getObjectContent();
   const std::string::size_type startOfPages = rootContent.find("/Pages");
   if(startOfPages == std::string::npos)
      throw Exception("Some document is wrong");
   // The reference looks like "/Pages 2 0 R"; the bounds end at its "R".
   const std::string::size_type endOfPages = rootContent.find("R", startOfPages);
   std::vector<Object *> objectWithKids = _root->getChildrenByBounds(
      static_cast<unsigned int>(startOfPages), static_cast<unsigned int>(endOfPages));
   if(objectWithKids.size() != 1)
      throw Exception("Some document is wrong");
   _retrieveAllPages(objectWithKids[0]);
   _root->retrieveMaxObjectNumber(_document->_maxObjectNumber);
   _clearParser();
}
/**
 * Resets the parser's temporary state: forgets the root object, releases the
 * buffered file content and empties the object map. The Object instances
 * themselves are not deleted here.
 */
void Parser::_clearParser()
{
   _root = 0;
   // Swapping with an empty temporary actually frees the buffer. The former
   // clear() + reserve() pair was only a non-binding shrink request, and the
   // argument-less reserve() is deprecated in newer standards.
   std::string().swap(_fileContent);
   _objects.clear();
}
/**
 * Reads the whole file fileName (binary mode) into _fileContent and checks
 * that it starts with a supported PDF header ("%PDF-1.0" .. "%PDF-1.4").
 *
 * @throws Exception when the file cannot be opened, its size cannot be
 *         determined, the header is not recognised or the version is
 *         unsupported.
 */
void Parser::_getFileContent(const char * fileName)
{
   ifstream pdfFile(fileName, ios::binary);
   if (pdfFile.fail())
   {
      // Start from an empty stream: a stream constructed from "File " has its
      // put position at 0, so the following output overwrote that prefix.
      stringstream errorMessage;
      errorMessage << "File " << fileName << " is absent";
      throw Exception(errorMessage);
   }
   // get length of file:
   pdfFile.seekg(0, ios::end);
   const std::streamoff length = pdfFile.tellg();
   if (length < 0)
   {
      // tellg() reports failure with -1; resizing to that would be invalid.
      throw Exception("Cannot determine size of PDF file");
   }
   pdfFile.seekg(0, ios::beg);
   _fileContent.resize(static_cast<size_t>(length));
   if (length > 0)
   {
      pdfFile.read(&_fileContent[0], length);
   }
   pdfFile.close();

   // check version: the header must be the very first bytes of the file
   const char * const header = "%PDF-1.";
   const size_t headerLength = strlen(header);
   if (_fileContent.compare(0, headerLength, header) != 0)
   {
      throw Exception("Unrecognized header of PDF file");
   }
   const char ver = (headerLength < _fileContent.size()) ? _fileContent[headerLength] : '\0';
   if (ver < '0' || ver > '4')
   {
      stringstream errorMsg;
      errorMsg<<" File with verion 1."<<ver<<" is not currently supported by merge library\n";
      throw Exception(errorMsg);
   }
}
void Parser::_createObjectTree(const char * fileName)
{
unsigned int rootObjectNumber = 0;
try
{
_getFileContent(fileName);
_readXRefAndCreateObjects();
rootObjectNumber = _readTrailerAndReturnRoot();
}
catch (std::exception &)
{
std::map<unsigned int, Object *>::const_iterator it(_objects.begin());
for(;it != _objects.end();it++)
{
delete (*it).second;
}
_objects.clear();
throw;
}
std::map<unsigned int, Object *>::iterator objectsIterator;
for ( objectsIterator = _objects.begin() ; objectsIterator != _objects.end(); objectsIterator++ )
{
Object * currentObject = (*objectsIterator).second;
_document->_allObjects.push_back(currentObject);
//key - object number : value - positions in object content of this reference
const std::map<unsigned int, Object::ReferencePositionsInContent> & refs =
_getReferences(currentObject->getObjectContent());
std::map<unsigned int, Object::ReferencePositionsInContent>::const_iterator refsIterator = refs.begin();
for(; refsIterator != refs.end(); ++refsIterator)
{
if(_objects.count((*refsIterator).first))
currentObject->addChild(_objects[(*refsIterator).first], (*refsIterator).second);
}
}
_root = _objects[rootObjectNumber];
}
const std::map<unsigned int, Object::ReferencePositionsInContent> & Parser::_getReferences(const std::string & objectContent)
{
unsigned int currentPosition(0), startOfNextSearch(0);
static std::map<unsigned int, std::vector<unsigned int> > searchResult;
searchResult.clear();
unsigned int streamStart = objectContent.find("stream");
if(streamStart == string::npos)
streamStart = objectContent.size();
while(startOfNextSearch < streamStart)
{
//try to find reference. reference example is 15 0 R
startOfNextSearch = objectContent.find(" R", startOfNextSearch);
currentPosition = startOfNextSearch;
if(currentPosition != std::string::npos)
{
//check that next character of " R" is WHITESPACE.
if((WHITESPACES.find(objectContent[currentPosition + 2]) == string::npos) &&
(DELIMETERS.find(objectContent[currentPosition + 2]) == string::npos)
)
{
//this is not reference. this is something looks like "0 0 0 RG"
++startOfNextSearch;
continue;
}
//get previos symbol and check that it is a number
unsigned int numberSearchCounter = _skipNumber(objectContent, --currentPosition);
//previos symbol is not a number
if(numberSearchCounter == currentPosition)
{
++startOfNextSearch;
continue;
}
else
{
currentPosition = numberSearchCounter;
}
bool isFound(false);
//previos symbols should be whitespaces
while((objectContent[currentPosition] == ' ') && --currentPosition)
{
isFound = true;
}
//previos symbol is not a whitespace
if(!isFound)
{
++startOfNextSearch;
continue;
}
//check that this and may be previos symbols are a numbers
numberSearchCounter = _skipNumber(objectContent, currentPosition);
if(numberSearchCounter == currentPosition)
{
++startOfNextSearch;
continue;
}
unsigned int objectNumber = Utils::stringToInt(objectContent.substr(numberSearchCounter + 1, currentPosition - numberSearchCounter));
searchResult[objectNumber].push_back(numberSearchCounter + 1);
++startOfNextSearch;
}
else
break;
}
return searchResult;
}
/**
 * Walks backwards over decimal digits in str, starting at currentPosition.
 *
 * Returns the index of the first non-digit character met while moving towards
 * the beginning of the string. The walk also stops as soon as index 0 is
 * reached, without examining the character stored there. When the character
 * at currentPosition is not a digit, currentPosition is returned unchanged.
 */
unsigned int Parser::_skipNumber(const std::string & str, unsigned int currentPosition)
{
   unsigned int cursor = currentPosition;
   while(NUMBERS.find(str[cursor]) != string::npos)
   {
      --cursor;
      if(cursor == 0)
         break;
   }
   return cursor;
}
// Reads the cross-reference section located by _getStartOfXrefWithRoot() and
// every earlier section reachable through the trailer's "Prev" entry, creating
// an Object in _objects for each entry marked "n" (in use).
// Throws Exception when a section does not start with the token "xref" or
// when the line following it does not contain exactly two tokens.
// An object number that is already present in _objects is not replaced, so the
// section read first wins.
void Parser::_readXRefAndCreateObjects()
{
unsigned int currentPostion = _getStartOfXrefWithRoot();
do
{
// Every section must begin with the keyword "xref".
const std::string & currentToken = _getNextToken(currentPostion);
if(currentToken != "xref")
{
throw Exception("Wrong xref in some document");
}
// The next line is a subsection header and must hold exactly two tokens.
unsigned int endOfLine = _getEndOfLineFromContent(currentPostion );
if(_countTokens(currentPostion, endOfLine) != 2)
{
throw Exception("Wrong xref in some document");
}
//now we are reading the xref
while(1)
{
// Subsection header: two numbers; the second is the number of entries that follow.
unsigned int firstObjectNumber = Utils::stringToInt(_getNextToken(currentPostion));
unsigned int objectCount = Utils::stringToInt(_getNextToken(currentPostion));
for(unsigned int i(0); i < objectCount; i++)
{
unsigned long first;
unsigned long second;
// An entry is only processed when its line has exactly three tokens.
if(_countTokens(currentPostion, _getEndOfLineFromContent(currentPostion)) == 3)
{
// first is passed below as the position of the object in the file content;
// second is read only to advance the position.
first = Utils::stringToInt(_getNextToken(currentPostion));
second = Utils::stringToInt(_getNextToken(currentPostion));
const string & use = _getNextToken(currentPostion);
if(!use.compare("n"))
{
unsigned int objectNumber;
try
{
std::pair<unsigned int, unsigned int> streamBounds;
bool hasObjectStream;
unsigned int generationNumber;
const std::string content = _getObjectContent(first, objectNumber, generationNumber, streamBounds, hasObjectStream);
// Keep the object that was registered first for this number.
if(!_objects.count(objectNumber))
{
Object * newObject = new Object(objectNumber, generationNumber, content, _document->_documentName ,streamBounds, hasObjectStream);
_objects[objectNumber] = newObject;
}
}
// Any std::exception raised while reading one object is swallowed here,
// so an entry that cannot be read is skipped.
catch(std::exception &)
{
}
}
}
else
{
;
}
// Step past the character following the entry.
++currentPostion;
}
// After a subsection either the "trailer" keyword or another subsection header follows.
unsigned int previosPostion = currentPostion;
const std::string & isTrailer = _getNextToken(currentPostion);
std::string trailer("trailer");
if(isTrailer == trailer)
{
// Rewind to the start of "trailer" so the trailer search below finds it.
currentPostion -= trailer.size();
break;
}
else
// Not a trailer: rewind and read the token again as a subsection header.
currentPostion = previosPostion;
}
}
// Continue with the previous xref section while the trailer has a "Prev"
// entry; the call stores that section's position in currentPostion.
while(_readTrailerAndRterievePrev(currentPostion, currentPostion));
}
unsigned int Parser::_getStartOfXrefWithRoot()
{
unsigned int leftBoundOfStartOfXref = _fileContent.rfind("startxref");
leftBoundOfStartOfXref = _fileContent.find_first_of(NUMBERS, leftBoundOfStartOfXref);
unsigned int rightBoundOfStartOfXref = _fileContent.find_first_not_of(NUMBERS, leftBoundOfStartOfXref + 1);
std::string startOfXref = _fileContent.substr(leftBoundOfStartOfXref, rightBoundOfStartOfXref - leftBoundOfStartOfXref);
int integerStartOfXref = Utils::stringToInt(startOfXref);
return integerStartOfXref;
}
/**
 * Skips white space starting at fromPosition and returns the position of the
 * first line-break character ('\n' or '\r') found from there in the file
 * content.
 *
 * The backward find_last_of() lookup starts at that line break and therefore
 * yields the same position whenever a line break was found.
 * NOTE(review): the backward lookup looks redundant - confirm the intent.
 */
unsigned int Parser::_getEndOfLineFromContent(unsigned int fromPosition)
{
   const unsigned int lineStart = _skipWhiteSpacesFromContent(fromPosition);
   const unsigned int firstBreak = _fileContent.find_first_of("\n\r", lineStart);
   const unsigned int lastBreak = _fileContent.find_last_of("\n\r", firstBreak);
   return lastBreak;
}
/**
 * Returns the bounds of the line of str that contains fromPosition:
 * first  - position of the closest '\n' at or before fromPosition, or 0 when
 *          there is none;
 * second - position of the closest '\n' at or after fromPosition, or
 *          str.size() when there is none.
 *
 * The result refers to a function-local static pair that is overwritten by
 * the next call.
 *
 * The searches are evaluated as std::string::size_type before being narrowed:
 * comparing the unsigned int pair members with npos never matched where
 * size_t is wider, so the fallback values were not applied.
 */
const std::pair<unsigned int, unsigned int> & Parser::_getLineBounds(const std::string & str, unsigned int fromPosition)
{
   static std::pair<unsigned int, unsigned int> bounds;

   const std::string::size_type lineStart = str.rfind('\n', fromPosition);
   bounds.first = (lineStart == std::string::npos) ? 0 : static_cast<unsigned int>(lineStart);

   const std::string::size_type lineEnd = str.find('\n', fromPosition);
   bounds.second = static_cast<unsigned int>((lineEnd == std::string::npos) ? str.size() : lineEnd);

   return bounds;
}
/**
 * Skips white space starting at fromPosition and returns the following run of
 * non-white-space characters from the file content. On return fromPosition
 * refers to the first character after the token.
 *
 * An empty token is returned when no token is available. The result refers to
 * a function-local static string that is overwritten by the next call.
 *
 * A token that runs up to the end of the content is handled explicitly.
 * Previously the "not found" value was used as the token end, producing a
 * huge token size and an out-of-bounds memcpy.
 */
const std::string & Parser::_getNextToken(unsigned int & fromPosition)
{
   static std::string token;
   token.clear();

   fromPosition = _skipWhiteSpacesFromContent(fromPosition);
   if(fromPosition >= _fileContent.size())
      return token;

   std::string::size_type tokenEnd = _fileContent.find_first_of(WHITESPACES, fromPosition);
   if(tokenEnd == std::string::npos)
      tokenEnd = _fileContent.size();

   if(tokenEnd > fromPosition)
   {
      token.assign(_fileContent, fromPosition, tokenEnd - fromPosition);
      fromPosition = static_cast<unsigned int>(tokenEnd);
   }
   return token;
}
/**
 * Counts the white-space terminated tokens of the file content that start
 * before rightBount, beginning at the first non-white-space character at or
 * after leftBound.
 *
 * Each white-space character found ends one token, so consecutive white-space
 * characters are counted separately. A final token that is not followed by
 * any white space is not counted.
 *
 * The scan stops when no further white space exists. Previously the "not
 * found" position was incremented, wrapped to 0 and the scan restarted from
 * the beginning of the file.
 */
unsigned int Parser::_countTokens(unsigned int leftBound, unsigned int rightBount)
{
   std::string::size_type position = _skipWhiteSpacesFromContent(leftBound);
   unsigned int tokensCount = 0;
   while (position < rightBount)
   {
      position = _fileContent.find_first_of(WHITESPACES, position);
      if (position == std::string::npos)
         break;
      ++tokensCount;
      //start search from next symbol
      ++position;
   }
   return tokensCount;
}
/**
 * Returns the position of the first non-white-space character of str at or
 * after fromPosition.
 *
 * fromPosition is returned unchanged when it is outside the string or already
 * refers to a non-white-space character. When only white space follows, the
 * (narrowed) std::string::npos value is returned.
 *
 * The character examined is str[fromPosition], matching
 * _skipWhiteSpacesFromContent. The previous code examined str[0], so white
 * space at fromPosition was not skipped unless the string also began with
 * white space.
 */
unsigned int Parser::_skipWhiteSpaces(const std::string & str, unsigned int fromPosition)
{
   if(fromPosition >= str.size())
      return fromPosition;
   if(WHITESPACES.find(str[fromPosition]) == string::npos)
      return fromPosition;
   return static_cast<unsigned int>(str.find_first_not_of(WHITESPACES, fromPosition));
}
/**
 * Returns the position of the first non-white-space character of the file
 * content at or after fromPosition.
 *
 * fromPosition is returned unchanged when it already refers to a
 * non-white-space character or lies outside the content. The bounds check
 * matters because positions are taken from the xref table of the file being
 * parsed, and indexing past the end is an out-of-bounds read. When only white
 * space follows, the (narrowed) std::string::npos value is returned.
 */
unsigned int Parser::_skipWhiteSpacesFromContent(unsigned int fromPosition)
{
   if(fromPosition >= _fileContent.size())
      return fromPosition;
   if(WHITESPACES.find(_fileContent[fromPosition]) == string::npos)
      return fromPosition;
   return static_cast<unsigned int>(_fileContent.find_first_not_of(WHITESPACES, fromPosition));
}
/**
 * Reads the object that starts at objectPosition in the file content.
 *
 * @param objectPosition   position of the "<number> <generation> obj" header
 * @param objectNumber     receives the object number
 * @param generationNumber receives the generation number
 * @param streamBounds     when the object has a stream: first is the position
 *                         of the first stream byte, second the position of
 *                         the closing "endstream"
 * @param hasObjectStream  receives whether a stream was found
 * @return the text between the header and "endobj", or, for stream objects,
 *         between the header and the first stream byte. The result refers to
 *         a function-local static string overwritten by the next call.
 * @throws Exception when the "obj" keyword, the content or the matching
 *         "endobj" is missing.
 *
 * All search results are kept as std::string::size_type. Stored in unsigned
 * int, a missing "endobj"/"endstream" was not detected where size_t is wider
 * than unsigned int, leading to a huge content size and an out-of-bounds copy.
 */
const std::string & Parser::_getObjectContent(unsigned int objectPosition, unsigned int & objectNumber, unsigned int & generationNumber, std::pair<unsigned int, unsigned int> & streamBounds, bool & hasObjectStream)
{
   hasObjectStream = false;
   unsigned int currentPosition = objectPosition;
   std::string token = _getNextToken(currentPosition); // number of object
   objectNumber = Utils::stringToInt(token);
   token = _getNextToken(currentPosition); // generation number
   generationNumber = Utils::stringToInt(token);
   token = Parser::getNextToken(_fileContent,currentPosition);
   if( token != "obj" )
   {
      std::stringstream strOut;
      strOut<<"Wrong object in PDF, in position "<<currentPosition<<" cannot continue!\n";
      throw Exception(strOut.str());
   }

   static std::string objectContent;
   const std::string::size_type contentStart = _fileContent.find_first_not_of(Parser::WHITESPACES,currentPosition);
   if( contentStart == std::string::npos )
   {
      std::stringstream strOut;
      strOut<<"Wrong object "<< objectNumber<< "in PDF, cannot find content for it\n";
      throw Exception(strOut.str());
   }

   std::string::size_type endOfContent = _fileContent.find("endobj", contentStart);
   if( endOfContent == std::string::npos )
   {
      stringstream errorMessage("Corrupted PDF file, obj does not have matching endobj");
      throw Exception(errorMessage);
   }

   std::string::size_type endOfStream = _fileContent.find("endstream", contentStart);
   if((endOfStream != std::string::npos) && (endOfStream < endOfContent))
   {
      // "endstream" itself contains "stream", so this search always succeeds.
      const std::string stream("stream");
      std::string::size_type beginOfStream = _fileContent.find(stream, contentStart) + stream.size();
      // Skip the end-of-line marker that follows the "stream" keyword.
      while(_fileContent[beginOfStream] == '\r')
      {
         ++beginOfStream;
      }
      if( _fileContent[beginOfStream] == '\n')
      {
         ++beginOfStream;
      }
      streamBounds.first = static_cast<unsigned int>(beginOfStream);

      // try to use Length field to determine end of stream.
      const std::string lengthToken = "/Length";
      const size_t lengthBegin = Parser::findTokenName(_fileContent,lengthToken,contentStart);
      if ( lengthBegin != std::string::npos )
      {
         std::string lengthStr;
         size_t lenPos = lengthBegin + lengthToken.size();
         bool useContentLength = false;
         if( Parser::getNextWord(lengthStr,_fileContent,lenPos) )
         {
            useContentLength = true;
            // "/Length 12 0 R" is an indirect reference, not a direct value.
            std::string refStr;
            if( Parser::getNextWord(refStr,_fileContent,lenPos) &&
                Parser::getNextWord(refStr,_fileContent,lenPos) &&
                refStr == "R" )
            {
               useContentLength = false;
            }
         }
         if( useContentLength )
         {
            std::stringstream strin(lengthStr);
            unsigned int streamLength = 0;
            strin>>streamLength;
            // Look for "endstream" behind the declared length; keep the first
            // "endstream" as fallback when the declared length is wrong.
            const std::string::size_type streamEndBegin = _fileContent.find("endstream", beginOfStream + streamLength);
            if( streamEndBegin != std::string::npos )
            {
               endOfStream = streamEndBegin;
            }
         }
      }
      streamBounds.second = static_cast<unsigned int>(endOfStream);
      endOfContent = beginOfStream;
      hasObjectStream = true;
   }

   objectContent.assign(_fileContent, contentStart, endOfContent - contentStart);
   return objectContent;
}
unsigned int Parser::_readTrailerAndReturnRoot()
{
unsigned int startOfTrailer = Parser::findToken(_fileContent,"trailer", _getStartOfXrefWithRoot());
std::string rootStr("/Root");
unsigned int startOfRoot = Parser::findToken(_fileContent,rootStr.data(), startOfTrailer);
if( startOfRoot == std::string::npos)
{
throw Exception("Cannot find Root object !");
}
std::string encryptStr("/Encrypt");
if( Parser::findToken(_fileContent,encryptStr,startOfTrailer) != std::string::npos )
{
throw Exception("Encrypted PDF is not supported!");
}
startOfRoot += rootStr.size()+1; //"/Root + ' '
unsigned int endOfRoot = startOfRoot;
while(NUMBERS.find(_fileContent[endOfRoot++]) != string::npos)
{}
--endOfRoot;
return Utils::stringToInt(_fileContent.substr(startOfRoot, endOfRoot - startOfRoot));
}
/**
 * Looks for a "Prev " entry in the trailer found at or after
 * startPositionForSearch.
 *
 * @param startPositionForSearch position from which the trailer is searched
 * @param previosXref receives the number following "Prev " when one is found
 * @return non-zero when a "Prev " entry precedes the next "startxref"
 *         keyword, zero otherwise (previosXref is then left untouched)
 * @throws Exception when no trailer is found.
 *
 * Positions are kept as size_t so that "not found" is detected where size_t
 * is wider than unsigned int.
 */
unsigned int Parser::_readTrailerAndRterievePrev(const unsigned int startPositionForSearch, unsigned int & previosXref)
{
   const size_t startOfTrailer = Parser::findToken(_fileContent,"trailer", startPositionForSearch);
   if( startOfTrailer == std::string::npos )
   {
      throw Exception("Cannot find trailer!");
   }

   const std::string prevKey("Prev ");
   const size_t prevKeyPosition = _fileContent.find(prevKey, startOfTrailer);
   const size_t startxref = _fileContent.find("startxref", startOfTrailer);
   // A "Prev " located behind "startxref" belongs to a later trailer.
   if(prevKeyPosition == std::string::npos || (prevKeyPosition > startxref))
      return false;

   const size_t startOfPrev = prevKeyPosition + prevKey.size();
   size_t endOfPrev = _fileContent.find_first_not_of(NUMBERS, startOfPrev);
   if(endOfPrev == std::string::npos)
      endOfPrev = _fileContent.size();
   previosXref = Utils::stringToInt(_fileContent.substr(startOfPrev, endOfPrev - startOfPrev));
   return true;
}
// Extracts the next token of str, starting the search at position.
// Leading white space is skipped; the token ends at the first white-space or
// delimiter character that follows (or at the end of the string).
// On success position is moved to the end of the token. An empty string is
// returned, and position left untouched, when position is outside the string
// or only white space remains.
std::string Parser::getNextToken(const std::string &str, unsigned int &position)
{
   std::string token;
   if( position < str.size() )
   {
      const size_t tokenBegin = str.find_first_not_of(Parser::WHITESPACES,position);
      if( tokenBegin != std::string::npos )
      {
         const size_t hit = str.find_first_of(Parser::WHITESPACES_AND_DELIMETERS,tokenBegin);
         const size_t tokenEnd = (hit == std::string::npos) ? str.size() : hit;
         position = tokenEnd;
         token = str.substr(tokenBegin,tokenEnd - tokenBegin);
         Parser::trim(token);
      }
   }
   return token;
}
/** @brief getNextWord
 *
 * Extracts the next white-space separated word of str.
 * For example: " 1 0 R \n" yields "1", then "0", then "R" on successive calls.
 *
 * @param out          receives the word
 * @param str          text to read from
 * @param nextPosition in: where to start; out: position right after the word
 * @param found        optional; receives the start position of the word, or
 *                     std::string::npos when no word is available
 * @return true when a non-empty word was stored in out
 */
bool Parser::getNextWord(std::string &out, const std::string &str, size_t &nextPosition, size_t *found)
{
   if( found )
   {
      *found = std::string::npos;
   }
   if( nextPosition >= str.size() )
   {
      return false;
   }
   // Skip leading white space; nothing but white space means no word.
   const size_t wordBegin = str.find_first_not_of(Parser::WHITESPACES,nextPosition);
   if ( wordBegin == std::string::npos )
   {
      return false;
   }
   if( found )
   {
      *found = wordBegin;
   }
   const size_t hit = str.find_first_of(Parser::WHITESPACES,wordBegin);
   const size_t wordEnd = (hit == std::string::npos) ? str.size() : hit;
   nextPosition = wordEnd;
   out = str.substr(wordBegin,wordEnd - wordBegin);
   Parser::trim(out);
   return !out.empty();
}
/** @brief trim
 *
 * Removes leading and trailing white space (see WHITESPACES) from str in
 * place. A string that consists only of white space becomes empty; the
 * previous implementation merely dropped its last character in that case.
 */
void Parser::trim(std::string &str)
{
   const std::string::size_type first = str.find_first_not_of(WHITESPACES);
   if( first == std::string::npos )
   {
      // empty or white space only
      str.clear();
      return;
   }
   const std::string::size_type last = str.find_last_not_of(WHITESPACES);
   str = str.substr(first, last - first + 1);
}
// Finds pattern as a token in content (see findToken), starting at start, and
// returns the text that follows it up to, but not including, the next
// delimiter character (or the end of content).
// Example tokens: "/L 12 0R" or "/Length 123".
// foundStart receives the position of the pattern, foundEnd the position of
// the last returned character. When the pattern is not found an empty string
// is returned and both outputs are left untouched.
std::string Parser::findTokenStr(const std::string &content, const std::string &pattern, size_t start, size_t &foundStart, size_t &foundEnd)
{
   const size_t patternPosition = Parser::findToken(content,pattern,start);
   if( patternPosition == std::string::npos )
   {
      return std::string();
   }
   foundStart = patternPosition;

   const size_t valueBegin = patternPosition + pattern.size();
   const size_t hit = content.find_first_of(Parser::DELIMETERS,valueBegin);
   const size_t valueEnd = (hit == std::string::npos) ? content.size() : hit;

   std::string value = content.substr(valueBegin,valueEnd - valueBegin);
   foundEnd = valueEnd - 1;
   return value;
}
// Returns the position of the first occurrence of keyword in content, at or
// after start, that is a complete token: the keyword must be followed by a
// white-space or delimiter character, or reach the end of content.
// Returns std::string::npos when there is no such occurrence.
// Example: content "/Transparency/ ..." with keyword "/Trans" yields npos.
size_t Parser::findToken(const std::string &content, const std::string &keyword,size_t start)
{
   for( size_t hit = content.find(keyword,start);
        hit != std::string::npos;
        hit = content.find(keyword,hit + keyword.size()) )
   {
      const size_t next = hit + keyword.size();
      if( next >= content.size() )
      {
         // the keyword reaches the end of the content
         return hit;
      }
      const char follower = content[next];
      if( Parser::WHITESPACES.find(follower) != std::string::npos ||
          Parser::DELIMETERS.find(follower) != std::string::npos )
      {
         return hit;
      }
   }
   return std::string::npos;
}
// Decides whether the token whose content starts at position start acts as a
// name (a key) rather than as a value.
// Example: in "/H /P /P 12 0 R" the tag /P can be a name (and also a value),
// while 12 cannot.
//
// The text after start is examined. If a non-white-space character comes
// before the next delimiter, or that delimiter is an opening brace, the
// answer is true on the first pass and false after one delimiter-led token
// has already been skipped. Otherwise one delimiter-led token is skipped and
// the check is repeated; meeting a second one yields true. Running out of
// text also yields true.
bool Parser::tokenIsAName(const std::string &content, size_t start )
{
   const std::string openBraces = "<[({";
   bool delimiterTokenSkipped = false;
   for(;;)
   {
      const size_t nonWhite = content.find_first_not_of(Parser::WHITESPACES,start);
      const size_t delim = content.find_first_of(Parser::DELIMETERS,start);
      if( nonWhite == std::string::npos || delim == std::string::npos )
      {
         return true;
      }
      const bool valueFollows = ( nonWhite < delim ) ||
                                ( openBraces.find(content[delim]) != std::string::npos );
      if( valueFollows )
      {
         return !delimiterTokenSkipped;
      }
      if( delimiterTokenSkipped )
      {
         return true;
      }
      delimiterTokenSkipped = true;
      start = content.find_first_of(Parser::WHITESPACES_AND_DELIMETERS,delim + 1);
   }
}
// Returns the position of the first occurrence of keyword in content, at or
// after start, that is used as a name rather than as a value.
// For example, in "/H /P /P 12 0 R" a search for /P returns the position of
// "/P 12 0 R", not the /P that is the value of /H.
// An occurrence qualifies when it reaches the end of content, or when it is
// followed by a white-space/delimiter character and tokenIsAName() accepts
// the text behind it. Returns std::string::npos when nothing qualifies.
size_t Parser::findTokenName(const std::string &content, const std::string &keyword,size_t start)
{
   size_t searchFrom = start;
   while( true )
   {
      const size_t hit = content.find(keyword,searchFrom);
      if( hit == std::string::npos )
      {
         return std::string::npos;
      }
      searchFrom = hit + keyword.size();
      if( searchFrom >= content.size() )
      {
         // the keyword reaches the end of the content
         return hit;
      }
      if( Parser::WHITESPACES_AND_DELIMETERS.find(content[searchFrom]) != std::string::npos &&
          tokenIsAName(content,searchFrom) )
      {
         return hit;
      }
   }
}
/**
 * Returns the position where the value of the element starting at
 * startOfPageElement ends in content.
 *
 * A stack of expected closing delimiters is used: it starts with the set
 * "/]>)}", and every "[" or "<" met first pushes "]" or ">" so that nested
 * arrays and dictionaries are skipped. When the stack becomes empty, the
 * position of the next character of the current delimiter set is returned,
 * or the current position when there is none.
 *
 * All positions are kept as std::string::size_type. Stored in unsigned int,
 * the "nothing found" branch and the final fallback could never trigger
 * where size_t is wider than unsigned int.
 */
unsigned int Parser::findEndOfElementContent(const std::string &content,unsigned int startOfPageElement)
{
   const std::string::size_type npos = std::string::npos;
   const std::string endDelim = "/]>)}";

   std::stack<std::string> delimStack;
   std::string delimeter = endDelim;
   delimStack.push(delimeter); //initial delimeter

   std::string::size_type foundEnd = npos;
   std::string::size_type curPos = startOfPageElement;
   // Allows stepping over one leading '/' when the value itself is a name.
   bool compensation = true;
   while(1)
   {
      const std::string::size_type nonWhiteSpace = content.find_first_not_of(Parser::WHITESPACES,curPos);
      const std::string::size_type foundDelimeter = content.find_first_of(delimeter,curPos);
      const std::string::size_type foundOpenBrace = content.find("[",curPos);
      const std::string::size_type foundOpenDict = content.find("<",curPos);

      if( foundDelimeter == npos && foundOpenBrace == npos && foundOpenDict == npos )
      {
         // Nothing left to match: unwind one level.
         if( !delimStack.empty() )
         {
            delimStack.pop();
         }
      }
      else if( foundDelimeter <= foundOpenBrace && foundDelimeter <= foundOpenDict )
      {
         // The expected closing delimiter comes first: close the current level.
         if( !delimStack.empty() )
         {
            delimStack.pop();
         }
         if( nonWhiteSpace == foundDelimeter && delimeter == endDelim )
         {
            curPos = foundDelimeter;
            if( content[foundDelimeter] == '/' && compensation )
            {
               curPos++;
               compensation = false;
            }
         }
         else
         {
            compensation = false;
            if( delimeter == endDelim )
            {
               curPos = foundDelimeter;
            }
            else
            {
               curPos = foundDelimeter + delimeter.size();
            }
         }
      }
      else if( foundOpenBrace <= foundDelimeter && foundOpenBrace <= foundOpenDict )
      {
         // An array opens first: expect its closing bracket.
         compensation = false;
         delimStack.push("]");
         curPos = foundOpenBrace + 1;
      }
      else if( foundOpenDict <= foundDelimeter && foundOpenDict <= foundOpenBrace )
      {
         // A '<' opens first: expect a closing '>'.
         compensation = false;
         delimStack.push(">");
         curPos = foundOpenDict + 1;
      }

      if( delimStack.empty() )
      {
         foundEnd = content.find_first_of(delimeter,curPos);
         if( foundEnd == npos )
         {
            foundEnd = curPos;
         }
         break;
      }
      delimeter = delimStack.top();
   }
   return static_cast<unsigned int>(foundEnd);
}