// NOTE(review): this copy of the file has lost its line breaks and, apparently,
// everything that was written between angle brackets: the names of the system
// headers, template arguments (element types of std::pair / std::map /
// std::vector), and longer spans inside _getFileContent / _createObjectTree and
// inside _getObjectContent. The code below is kept exactly as found; restore it
// from a pristine copy before building.
//
// Overview of the merge_lib::Parser members defined in this part of the file
// (descriptions cover only what is still visible here):
//
//  WHITESPACES, DELIMETERS, NUMBERS, WHITESPACES_AND_DELIMETERS
//      Character classes used by every scanning routine below.
//  parseDocument(fileName)
//      Allocates a Document, runs _createObjectTree and _createDocument and
//      returns the Document. On std::exception it calls _clearParser, deletes
//      the Document, resets _document to NULL and rethrows.
//  _retrieveAllPages(objectWithKids)
//      Recursive walk: an object whose content has no "/Kids" but contains
//      "/Page" is registered in _document->_pages under the number
//      _pages.size() + 1; otherwise the children located between "/Kids" and
//      the following "]" are visited in turn.
//  _createDocument(docName)
//      Takes the single child of the root referenced between "/Pages" and the
//      next "R", collects the pages from it, asks the root for the maximum
//      object number and clears the parser. Throws Exception("Some document is
//      wrong") when "/Pages" is missing or does not resolve to one child.
//  _clearParser()
//      Resets _root and empties _fileContent and _objects.
//  _getFileContent(fileName)
//      Reads the whole file (binary mode) into _fileContent and inspects the
//      character after "%PDF-1." when that header is at offset 0. The rest of
//      this function and the beginning of _createObjectTree are missing here.
//      NOTE(review): the stringstream is constructed with "File " and then
//      written with <<; confirm the prefix is not overwritten.
//  _createObjectTree (tail only)
//      On failure deletes every Object in _objects and rethrows; otherwise
//      appends each object to _document->_allObjects, attaches referenced
//      objects as children and sets _root to _objects[rootObjectNumber].
//  _getReferences(objectContent)
//      Scans the content up to the first "stream" for " R" preceded by
//      "<digits> <digits>" and records, per object number, the position where
//      that number starts. Returns a reference to a function-local static, so
//      every call overwrites the previous result.
//  _skipNumber(str, currentPosition)
//      Walks backwards while the character is a digit and returns the index
//      where it stopped (stops at index 0 as well).
//  _readXRefAndCreateObjects()
//      Reads "xref" sections starting at _getStartOfXrefWithRoot, creating an
//      Object for each entry marked "n" whose number is not in _objects yet,
//      and repeats while _readTrailerAndRterievePrev reports a previous xref.
//      Throws Exception("Wrong xref in some document") on a malformed section
//      header; exceptions raised while reading a single object are caught and
//      ignored.
//  _getStartOfXrefWithRoot()
//      Returns the number that follows the last "startxref" in the file.
//  _getEndOfLineFromContent, _getLineBounds, _countTokens
//      Line / token helpers. _getLineBounds returns a reference to a
//      function-local static pair.
//  _getNextToken(fromPosition)
//      Skips whitespace, copies the characters up to the next whitespace into a
//      function-local static string, advances fromPosition past them and
//      returns a reference to that string (empty when no token was found).
//  _skipWhiteSpaces / _skipWhiteSpacesFromContent
//      Return the first non-whitespace position at or after fromPosition.
//      NOTE(review): _skipWhiteSpaces tests str[0] where the *FromContent
//      variant tests the character at the given position - confirm intended.
//  _getObjectContent(...)
//      Reads "<number> <generation> obj" at objectPosition and returns the
//      object text, reporting stream bounds through the out-parameters. The
//      middle of this function is missing from this copy.
//  _readTrailerAndReturnRoot()
//      Returns the object number following "/Root" in the trailer. Throws
//      Exception when "/Root" is absent or "/Encrypt" is present.
//  _readTrailerAndRterievePrev(startPositionForSearch, previosXref)
//      Looks for "Prev " between the trailer and "startxref"; when present,
//      stores its number in previosXref and returns true, otherwise false.
//      Throws Exception("Cannot find trailer!") when no trailer is found.
//  getNextToken(str, position)
//      Returns the next token delimited by whitespace or a delimiter and moves
//      position to its end; returns "" when nothing is left.
//  getNextWord(out, str, nextPosition, found)
//      Same idea, but words are split on whitespace only; *found (if given)
//      receives the start of the word. Returns false when there is no word.
//  trim(str)
//      Removes leading and trailing whitespace.
//      NOTE(review): for a string made only of whitespace both searches return
//      npos and the result is substr(0, length() - 1) - confirm intended.
//  findTokenStr(content, pattern, start, foundStart, foundEnd)
//      Locates pattern with findToken and returns the text between the end of
//      the pattern and the next delimiter; foundStart / foundEnd are written
//      only on success.
//
// NOTE(review): throughout this part, results of std::string::find are stored
// in unsigned int and then compared with std::string::npos. Where size_t is
// wider than unsigned int that comparison can never be true - confirm the
// target platforms before relying on the "not found" branches.
#include #include #include #include #include #include #include "Parser.h" #include "Object.h" #include "Exception.h" #include "Utils.h" using namespace merge_lib; using namespace std; const std::string Parser::WHITESPACES(" \t\f\v\n\r"); const std::string Parser::DELIMETERS("()<>{}/%]["); const std::string Parser::NUMBERS("0123456789"); const std::string Parser::WHITESPACES_AND_DELIMETERS = Parser::WHITESPACES + Parser::DELIMETERS; Document * Parser::parseDocument(const char * fileName) { _document = new Document(fileName); try { _createObjectTree(fileName); _createDocument(fileName); } catch( std::exception &) { _clearParser(); delete _document; _document = NULL; throw; } return _document; } void Parser::_retrieveAllPages(Object * objectWithKids) { std::string & objectContent = objectWithKids->getObjectContent(); unsigned int startOfKids = objectContent.find("/Kids"); unsigned int endOfKids = objectContent.find("]", startOfKids); if( (startOfKids == std::string::npos) && (objectContent.find("/Page") != std::string::npos) ) { unsigned int numberOfPages = _document->_pages.size() + 1; Page * newPage = new Page(numberOfPages); newPage->_root = objectWithKids; _document->_pages.insert(std::pair(numberOfPages, newPage)); return; } const std::vector & kids = objectWithKids->getSortedByPositionChildren(startOfKids, endOfKids); for(size_t i(0); i < kids.size(); ++i) { _retrieveAllPages(kids[i]); } } void Parser::_createDocument(const char * docName) { _document->_root = _root; Object * objectWithPages = 0; std::string & rootContent = _root->getObjectContent(); unsigned int startOfPages = rootContent.find("/Pages"); if(startOfPages == std::string::npos) throw Exception("Some document is wrong"); unsigned int endOfPages = rootContent.find("R", startOfPages); std::vector objectWithKids = _root->getChildrenByBounds(startOfPages, endOfPages); if(objectWithKids.size() != 1) throw Exception("Some document is wrong"); _retrieveAllPages(objectWithKids[0]); 
_root->retrieveMaxObjectNumber(_document->_maxObjectNumber); _clearParser(); } void Parser::_clearParser() { _root = 0; _fileContent.clear(); _fileContent.reserve(); _objects.clear(); } void Parser::_getFileContent(const char * fileName) { ifstream pdfFile; pdfFile.open (fileName, ios::binary ); if (pdfFile.fail()) { stringstream errorMessage("File "); errorMessage << fileName << " is absent" << "\0"; throw Exception(errorMessage); } // get length of file: pdfFile.seekg (0, ios::end); int length = pdfFile.tellg(); pdfFile.seekg (0, ios::beg); _fileContent.resize(length); pdfFile.read(&_fileContent[0], length); // check version const char *header = "%PDF-1."; size_t verPos = _fileContent.find(header); if( verPos == 0 ) { verPos += strlen(header); char ver = _fileContent[verPos]; if( ver < '0' || ver > '4' ) { stringstream errorMsg; errorMsg<<" File with verion 1."<::const_iterator it(_objects.begin()); for(;it != _objects.end();it++) { delete (*it).second; } _objects.clear(); throw; } std::map::iterator objectsIterator; for ( objectsIterator = _objects.begin() ; objectsIterator != _objects.end(); objectsIterator++ ) { Object * currentObject = (*objectsIterator).second; _document->_allObjects.push_back(currentObject); //key - object number : value - positions in object content of this reference const std::map & refs = _getReferences(currentObject->getObjectContent()); std::map::const_iterator refsIterator = refs.begin(); for(; refsIterator != refs.end(); ++refsIterator) { if(_objects.count((*refsIterator).first)) currentObject->addChild(_objects[(*refsIterator).first], (*refsIterator).second); } } _root = _objects[rootObjectNumber]; } const std::map & Parser::_getReferences(const std::string & objectContent) { unsigned int currentPosition(0), startOfNextSearch(0); static std::map > searchResult; searchResult.clear(); unsigned int streamStart = objectContent.find("stream"); if(streamStart == string::npos) streamStart = objectContent.size(); while(startOfNextSearch < 
streamStart) { //try to find reference. reference example is 15 0 R startOfNextSearch = objectContent.find(" R", startOfNextSearch); currentPosition = startOfNextSearch; if(currentPosition != std::string::npos) { //check that next character of " R" is WHITESPACE. if((WHITESPACES.find(objectContent[currentPosition + 2]) == string::npos) && (DELIMETERS.find(objectContent[currentPosition + 2]) == string::npos) ) { //this is not reference. this is something looks like "0 0 0 RG" ++startOfNextSearch; continue; } //get previos symbol and check that it is a number unsigned int numberSearchCounter = _skipNumber(objectContent, --currentPosition); //previos symbol is not a number if(numberSearchCounter == currentPosition) { ++startOfNextSearch; continue; } else { currentPosition = numberSearchCounter; } bool isFound(false); //previos symbols should be whitespaces while((objectContent[currentPosition] == ' ') && --currentPosition) { isFound = true; } //previos symbol is not a whitespace if(!isFound) { ++startOfNextSearch; continue; } //check that this and may be previos symbols are a numbers numberSearchCounter = _skipNumber(objectContent, currentPosition); if(numberSearchCounter == currentPosition) { ++startOfNextSearch; continue; } unsigned int objectNumber = Utils::stringToInt(objectContent.substr(numberSearchCounter + 1, currentPosition - numberSearchCounter)); searchResult[objectNumber].push_back(numberSearchCounter + 1); ++startOfNextSearch; } else break; } return searchResult; } unsigned int Parser::_skipNumber(const std::string & str, unsigned int currentPosition) { unsigned int numberSearchCounter = currentPosition; while((NUMBERS.find(str[numberSearchCounter]) != string::npos) && --numberSearchCounter) {} return numberSearchCounter; } void Parser::_readXRefAndCreateObjects() { unsigned int currentPostion = _getStartOfXrefWithRoot(); do { const std::string & currentToken = _getNextToken(currentPostion); if(currentToken != "xref") { throw Exception("Wrong xref in some 
document"); } unsigned int endOfLine = _getEndOfLineFromContent(currentPostion ); if(_countTokens(currentPostion, endOfLine) != 2) { throw Exception("Wrong xref in some document"); } //now we are reading the xref while(1) { unsigned int firstObjectNumber = Utils::stringToInt(_getNextToken(currentPostion)); unsigned int objectCount = Utils::stringToInt(_getNextToken(currentPostion)); for(unsigned int i(0); i < objectCount; i++) { unsigned long first; unsigned long second; if(_countTokens(currentPostion, _getEndOfLineFromContent(currentPostion)) == 3) { first = Utils::stringToInt(_getNextToken(currentPostion)); second = Utils::stringToInt(_getNextToken(currentPostion)); const string & use = _getNextToken(currentPostion); if(!use.compare("n")) { unsigned int objectNumber; try { std::pair streamBounds; bool hasObjectStream; unsigned int generationNumber; const std::string content = _getObjectContent(first, objectNumber, generationNumber, streamBounds, hasObjectStream); if(!_objects.count(objectNumber)) { Object * newObject = new Object(objectNumber, generationNumber, content, _document->_documentName ,streamBounds, hasObjectStream); _objects[objectNumber] = newObject; } } catch(std::exception &) { } } } else { ; } ++currentPostion; } unsigned int previosPostion = currentPostion; const std::string & isTrailer = _getNextToken(currentPostion); std::string trailer("trailer"); if(isTrailer == trailer) { currentPostion -= trailer.size(); break; } else currentPostion = previosPostion; } } while(_readTrailerAndRterievePrev(currentPostion, currentPostion)); } unsigned int Parser::_getStartOfXrefWithRoot() { unsigned int leftBoundOfStartOfXref = _fileContent.rfind("startxref"); leftBoundOfStartOfXref = _fileContent.find_first_of(NUMBERS, leftBoundOfStartOfXref); unsigned int rightBoundOfStartOfXref = _fileContent.find_first_not_of(NUMBERS, leftBoundOfStartOfXref + 1); std::string startOfXref = _fileContent.substr(leftBoundOfStartOfXref, rightBoundOfStartOfXref - 
leftBoundOfStartOfXref); int integerStartOfXref = Utils::stringToInt(startOfXref); return integerStartOfXref; } unsigned int Parser::_getEndOfLineFromContent(unsigned int fromPosition) { fromPosition = _skipWhiteSpacesFromContent(fromPosition); unsigned int endOfLine = _fileContent.find_first_of("\n\r", fromPosition); endOfLine = _fileContent.find_last_of("\n\r", endOfLine); return endOfLine; } const std::pair & Parser::_getLineBounds(const std::string & str, unsigned int fromPosition) { static std::pair bounds; bounds.first = str.rfind('\n', fromPosition); if(bounds.first == string::npos) bounds.first = 0; bounds.second = str.find('\n', fromPosition); if(bounds.second == string::npos) bounds.second = str.size(); return bounds; } const std::string & Parser::_getNextToken(unsigned int & fromPosition) { fromPosition = _skipWhiteSpacesFromContent(fromPosition); unsigned int position = _fileContent.find_first_of(WHITESPACES, fromPosition); static std::string token; if(position > fromPosition) { unsigned int tokenSize = position - fromPosition; token.resize(tokenSize); memcpy(&token[0], &_fileContent[fromPosition], tokenSize); fromPosition = position; return token; } else { //TODO throw exception } token = ""; return token; } unsigned int Parser::_countTokens(unsigned int leftBound, unsigned int rightBount) { unsigned int position = _skipWhiteSpacesFromContent(leftBound); unsigned int tokensCount = 0; while (position < rightBount) { position = _fileContent.find_first_of(WHITESPACES, position); if (position != string::npos) ++tokensCount; //start search from next symbol ++position; } return tokensCount; } unsigned int Parser::_skipWhiteSpaces(const std::string & str, unsigned int fromPosition) { unsigned int position = fromPosition; if(WHITESPACES.find(str[0]) != string::npos) position = str.find_first_not_of(WHITESPACES, position); return position; } unsigned int Parser::_skipWhiteSpacesFromContent(unsigned int fromPosition) { unsigned int position = fromPosition; 
if(WHITESPACES.find(_fileContent[position]) != string::npos) position = _fileContent.find_first_not_of(WHITESPACES, position);// + 1; return position; } const std::string & Parser::_getObjectContent(unsigned int objectPosition, unsigned int & objectNumber, unsigned int & generationNumber, std::pair & streamBounds, bool & hasObjectStream) { hasObjectStream = false; unsigned int currentPosition = objectPosition; std::string token = _getNextToken(currentPosition); // number of object objectNumber = Utils::stringToInt(token); token = _getNextToken(currentPosition); // generation number - not interesting generationNumber = Utils::stringToInt(token); token = Parser::getNextToken(_fileContent,currentPosition); if( token != "obj" ) { std::stringstream strOut; strOut<<"Wrong object in PDF, in position "<>streamEnd; streamEnd += beginOfStream; unsigned int streamEndBegin = _fileContent.find("endstream",streamEnd); if( streamEndBegin != std::string::npos ) { endOfStream = streamEndBegin; } } } streamBounds.second = endOfStream; endOfContent = beginOfStream; hasObjectStream = true; } unsigned int contentSize = endOfContent - currentPosition; objectContent.resize(contentSize); memcpy(&objectContent[0], &_fileContent[currentPosition], contentSize); return objectContent; } unsigned int Parser::_readTrailerAndReturnRoot() { unsigned int startOfTrailer = Parser::findToken(_fileContent,"trailer", _getStartOfXrefWithRoot()); std::string rootStr("/Root"); unsigned int startOfRoot = Parser::findToken(_fileContent,rootStr.data(), startOfTrailer); if( startOfRoot == std::string::npos) { throw Exception("Cannot find Root object !"); } std::string encryptStr("/Encrypt"); if( Parser::findToken(_fileContent,encryptStr,startOfTrailer) != std::string::npos ) { throw Exception("Encrypted PDF is not supported!"); } startOfRoot += rootStr.size()+1; //"/Root + ' ' unsigned int endOfRoot = startOfRoot; while(NUMBERS.find(_fileContent[endOfRoot++]) != string::npos) {} --endOfRoot; return 
Utils::stringToInt(_fileContent.substr(startOfRoot, endOfRoot - startOfRoot)); } unsigned int Parser::_readTrailerAndRterievePrev(const unsigned int startPositionForSearch, unsigned int & previosXref) { unsigned int startOfTrailer = Parser::findToken(_fileContent,"trailer", startPositionForSearch); if( startOfTrailer == std::string::npos ) { throw Exception("Cannot find trailer!"); } unsigned int startOfPrev = _fileContent.find("Prev ", startOfTrailer); unsigned int startxref = _fileContent.find("startxref", startOfTrailer); if(startOfPrev == string::npos || (startOfPrev > startxref)) return false; //"Prev "s length = 5 else startOfPrev += 5; unsigned int endOfPrev = startOfPrev; while(NUMBERS.find(_fileContent[endOfPrev++]) != string::npos) {} --endOfPrev; previosXref = Utils::stringToInt(_fileContent.substr(startOfPrev, endOfPrev - startOfPrev)); return true; } //Method finds the token from current position from string // It uses PDF whitespaces and delimeters to recognize // Returned string without begin/end spaces std::string Parser::getNextToken(const std::string &str, unsigned int &position) { if( position >= str.size() ) { return ""; } //skip first spaces size_t beg_pos = str.find_first_not_of(Parser::WHITESPACES,position); if ( beg_pos == std::string::npos ) { // it is empty string! 
return ""; } size_t end_pos = str.find_first_of(Parser::WHITESPACES_AND_DELIMETERS,beg_pos); if ( end_pos == std::string::npos ) { end_pos = str.size(); } position = end_pos; std::string out = str.substr(beg_pos,end_pos - beg_pos); Parser::trim(out); return out; } /** @brief getNextWord * * method finds and returns next word from the string * For example: " 1 0 R \n" will return "1" , then "0" then "R" */ bool Parser::getNextWord(std::string &out, const std::string &str, size_t &nextPosition, size_t *found) { if( found ) { *found = std::string::npos; } //trace("position = %d",position); if( nextPosition >= str.size() ) { return false; } //skip first spaces size_t beg_pos = str.find_first_not_of(Parser::WHITESPACES,nextPosition); if ( beg_pos == std::string::npos ) { // it is empty string! return false; } if( found ) { *found = beg_pos; } size_t end_pos = str.find_first_of(Parser::WHITESPACES,beg_pos); if ( end_pos == std::string::npos ) { end_pos = str.size(); } nextPosition = end_pos; out = str.substr(beg_pos,end_pos - beg_pos); Parser::trim(out); if( out.empty() ) { return false; } return true; } /** @brief trim * * @todo: document this function */ void Parser::trim(std::string &str) { std::string::size_type pos1 = str.find_first_not_of(WHITESPACES); std::string::size_type pos2 = str.find_last_not_of(WHITESPACES); str = str.substr(pos1 == std::string::npos ? 0 : pos1, pos2 == std::string::npos ? 
str.length() - 1 : pos2 - pos1 + 1); } // Method tries to find the PDF token from the content // The token is "/L 12 0R" or /Length 123 std::string Parser::findTokenStr(const std::string &content, const std::string &pattern, size_t start, size_t &foundStart, size_t &foundEnd) { size_t cur_pos = Parser::findToken(content,pattern,start); if( cur_pos == std::string::npos ) { return ""; } foundStart = cur_pos; cur_pos += pattern.size(); // then lets parse the content of remaining part size_t end_pos = content.find_first_of(Parser::DELIMETERS,cur_pos); if( end_pos == std::string::npos ) { end_pos = content.size(); } std::string token = content.substr(cur_pos,end_pos-cur_pos); foundEnd = end_pos -1; return token; } // Method tries to find token in the string from specified position, // returns position of first occurent or npos if not found // It properly handles cases when content contains strings which // contains token but not euqal to it // Example: content "/Transparency/ ..." pattern "/Trans // will return npos. 
// Searches `content`, beginning at `start`, for an occurrence of `keyword`
// that ends on a token boundary: the character right after the match must be
// a PDF whitespace or delimiter, or the match must reach the end of `content`.
// Occurrences that are only a prefix of a longer word (for example "/Trans"
// inside "/Transparency") are skipped and the search goes on behind them.
//
// @param content  text to search in
// @param keyword  token to look for
// @param start    position the search starts from
// @return position of the first accepted occurrence, or std::string::npos
size_t Parser::findToken(const std::string &content, const std::string &keyword, size_t start)
{
    // After a rejected match the scan resumes behind the keyword. An empty
    // keyword matches without consuming anything, so step by one character in
    // that case; otherwise the loop would test the same position forever.
    const size_t step = keyword.empty() ? 1 : keyword.size();

    size_t searchFrom = start;
    while (true)
    {
        const size_t matchPos = content.find(keyword, searchFrom);
        if (matchPos == std::string::npos)
        {
            return std::string::npos;
        }

        const size_t afterMatch = matchPos + keyword.size();
        if (afterMatch >= content.size())
        {
            // the keyword is the last thing in the content
            return matchPos;
        }

        const char next = content[afterMatch];
        if (Parser::WHITESPACES.find(next) != std::string::npos ||
            Parser::DELIMETERS.find(next) != std::string::npos)
        {
            return matchPos;
        }

        searchFrom = matchPos + step;
    }
}

// Method checks if token at current position can be a Name or it is not name but value
// Example
// /H /P /P 12 0 R
// the tag /P can be a name (and a value also), while 12 cannot
// start defines the position of token content
//
// How the decision is made, starting at `start`:
//  - if a non-delimiter character, or one of the opening braces "<[({", is the
//    next thing after the whitespace, the result is true;
//  - otherwise the next item starts with some other delimiter: that item is
//    skipped once and the same test is applied to what follows it - this time
//    a plain character or an opening brace gives false, anything else true;
//  - if no further non-whitespace character or no further delimiter exists,
//    the result is true.
bool Parser::tokenIsAName(const std::string &content, size_t start)
{
    const std::string openBraces = "<[({";
    bool skippedOneItem = false;
    while (true)
    {
        const size_t foundNonWhite = content.find_first_not_of(Parser::WHITESPACES, start);
        const size_t foundDelim = content.find_first_of(Parser::DELIMETERS, start);
        if (foundNonWhite == std::string::npos || foundDelim == std::string::npos)
        {
            return true;
        }

        const bool plainOrOpening =
            (foundNonWhite < foundDelim) ||
            (openBraces.find(content[foundDelim]) != std::string::npos);
        if (plainOrOpening)
        {
            return !skippedOneItem;
        }
        if (skippedOneItem)
        {
            return true;
        }

        // step over the item that starts at foundDelim and look once more
        skippedOneItem = true;
        start = content.find_first_of(Parser::WHITESPACES_AND_DELIMETERS, foundDelim + 1);
    }
}

// Method tries to find token name in the string from specified position,
// For example, the string contains /H /P /P 12 0 R.
// If search for /P then it will return position of /P 12 0 R, not value of // /H /P size_t Parser::findTokenName(const std::string &content, const std::string &keyword,size_t start) { size_t cur_pos = start; // lets find pattern first size_t foundStart = std::string::npos; size_t savedPos = 0; std::string braces = "<[({"; while( 1 ) { cur_pos = content.find(keyword,cur_pos); if( cur_pos == std::string::npos ) { break; } savedPos = cur_pos; cur_pos += keyword.size(); if( cur_pos < content.size() ) { if( Parser::WHITESPACES_AND_DELIMETERS.find(content[cur_pos]) != std::string::npos ) { if( tokenIsAName(content,cur_pos ) ) { foundStart = savedPos; break; } } } else { foundStart = savedPos; // end of line is reached break; } } return foundStart; } unsigned int Parser::findEndOfElementContent(const std::string &content,unsigned int startOfPageElement) { unsigned int foundEnd = std::string::npos; std::stack delimStack; std::string endDelim = "/]>)}"; unsigned int curPos = startOfPageElement; std::string openDict("<"); std::string openArray("["); std::string delimeter = endDelim; delimStack.push(delimeter); //initial delimeter bool compensation = true; while(1) { unsigned int nonWhiteSpace = content.find_first_not_of(Parser::WHITESPACES,curPos); unsigned int foundDelimeter = content.find_first_of(delimeter,curPos); unsigned int foundOpenBrace = content.find("[",curPos); unsigned int foundOpenDict = content.find("<",curPos); if( foundDelimeter == std::string::npos && foundOpenBrace == std::string::npos && foundOpenDict == std::string::npos ) { if( !delimStack.empty() ) { delimStack.pop(); } } else if( (foundDelimeter <= foundOpenBrace && foundDelimeter <= foundOpenDict ) ) { if( !delimStack.empty() ) { delimStack.pop(); } if( nonWhiteSpace == foundDelimeter && delimeter == endDelim ) { curPos = foundDelimeter; if(content[foundDelimeter] == '/' && compensation ) { curPos ++; compensation = false; } } else { compensation = false; if( delimeter == endDelim ) { curPos = 
foundDelimeter; } else { curPos = foundDelimeter + delimeter.size(); } } } else if( foundOpenBrace <= foundDelimeter && foundOpenBrace <= foundOpenDict ) { compensation = false; delimStack.push("]"); curPos = foundOpenBrace + openArray.size(); } else if( foundOpenDict <= foundDelimeter && foundOpenDict <= foundOpenBrace ) { compensation = false; delimStack.push(">"); curPos = foundOpenDict + openDict.size(); } if( delimStack.empty() ) { foundEnd = content.find_first_of(delimeter,curPos); if( foundEnd == std::string::npos ) { foundEnd = curPos; } break; } delimeter = delimStack.top(); } return foundEnd; }