slate file incorrectly initializes the arguments

up vote
0
down vote

favorite

My slate file for parsing pdf is not working correctly. I get the error attached:

Traceback (most recent call last):

  File "<stdin>", line 2, in <module>

  File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__

    self.doc = PDFDocument(file)

TypeError: __init__() takes at least 2 arguments (1 given)

for this slate file. I had to modify the import pdfminer... to correct from pdfminer.pdfdocument import PDFDocument for a previous issue but can't figure out how to correct this one from analyzing traceback statement... I tried finding the missing error... Any help with how to correct this problem would much appreciated, thanks!

from StringIO import StringIO



from pdfminer.pdfparser import PDFParser

from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdfinterp import PDFResourceManager

from pdfminer.pdfinterp import PDFPageInterpreter as PI

from pdfminer.pdfdevice import PDFDevice

from pdfminer.converter import TextConverter



import utils



__all__ = ['PDF']



class PDFPageInterpreter(PI):

    def process_page(self, page):

        if 1 <= self.debug:

            print >>stderr, 'Processing page: %r' % page

        (x0,y0,x1,y1) = page.mediabox

        if page.rotate == 90:

            ctm = (0,-1,1,0, -y0,x1)

        elif page.rotate == 180:

            ctm = (-1,0,0,-1, x1,y1)

        elif page.rotate == 270:

            ctm = (0,1,-1,0, y1,-x0)

        else:

            ctm = (1,0,0,1, -x0,-y0)

        self.device.outfp.seek(0)

        self.device.outfp.buf = ''

        self.device.begin_page(page, ctm)

        self.render_contents(page.resources, page.contents, ctm=ctm)

        self.device.end_page(page)

        return self.device.outfp.getvalue()



class PDF(list):

    def __init__(self, file, password='', just_text=1):

        self.parser = PDFParser(file)

        self.doc = PDFDocument()

        self.parser.set_document(self.doc)

        self.doc.set_parser(self.parser)

        self.doc.initialize(password)

        if self.doc.is_extractable:

            self.resmgr = PDFResourceManager()

            self.device = TextConverter(self.resmgr, outfp=StringIO())

            self.interpreter = PDFPageInterpreter(

            self.resmgr, self.device)

            for page in self.doc.get_pages():

                self.append(self.interpreter.process_page(page))

            self.metadata = self.doc.info

        if just_text:

            self._cleanup()



    def _cleanup(self):

        """ 

        Frees lots of non-textual information, such as the fonts

        and images and the objects that were needed to parse the

        PDF.

        """

        del self.device

        del self.doc

        del self.parser

        del self.resmgr

        del self.interpreter



    def text(self, clean=True):

        """ 

        Returns the text of the PDF as a single string.

        Options:



          :clean:

            Removes misc cruft, like lots of whitespace.

        """

        if clean:

            return ''.join(utils.trim_whitespace(page) for page in self)

        else:

            return ''.join(self)

asked Nov 8 at 2:19

RyanWolfe9013

264

add a comment |

up vote
0
down vote

favorite

My slate file for parsing pdf is not working correctly. I get the error attached:

Traceback (most recent call last):

  File "<stdin>", line 2, in <module>

  File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__

    self.doc = PDFDocument(file)

TypeError: __init__() takes at least 2 arguments (1 given)

from StringIO import StringIO



from pdfminer.pdfparser import PDFParser

from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdfinterp import PDFResourceManager

from pdfminer.pdfinterp import PDFPageInterpreter as PI

from pdfminer.pdfdevice import PDFDevice

from pdfminer.converter import TextConverter



import utils



__all__ = ['PDF']



class PDFPageInterpreter(PI):

    def process_page(self, page):

        if 1 <= self.debug:

            print >>stderr, 'Processing page: %r' % page

        (x0,y0,x1,y1) = page.mediabox

        if page.rotate == 90:

            ctm = (0,-1,1,0, -y0,x1)

        elif page.rotate == 180:

            ctm = (-1,0,0,-1, x1,y1)

        elif page.rotate == 270:

            ctm = (0,1,-1,0, y1,-x0)

        else:

            ctm = (1,0,0,1, -x0,-y0)

        self.device.outfp.seek(0)

        self.device.outfp.buf = ''

        self.device.begin_page(page, ctm)

        self.render_contents(page.resources, page.contents, ctm=ctm)

        self.device.end_page(page)

        return self.device.outfp.getvalue()



class PDF(list):

    def __init__(self, file, password='', just_text=1):

        self.parser = PDFParser(file)

        self.doc = PDFDocument()

        self.parser.set_document(self.doc)

        self.doc.set_parser(self.parser)

        self.doc.initialize(password)

        if self.doc.is_extractable:

            self.resmgr = PDFResourceManager()

            self.device = TextConverter(self.resmgr, outfp=StringIO())

            self.interpreter = PDFPageInterpreter(

            self.resmgr, self.device)

            for page in self.doc.get_pages():

                self.append(self.interpreter.process_page(page))

            self.metadata = self.doc.info

        if just_text:

            self._cleanup()



    def _cleanup(self):

        """ 

        Frees lots of non-textual information, such as the fonts

        and images and the objects that were needed to parse the

        PDF.

        """

        del self.device

        del self.doc

        del self.parser

        del self.resmgr

        del self.interpreter



    def text(self, clean=True):

        """ 

        Returns the text of the PDF as a single string.

        Options:



          :clean:

            Removes misc cruft, like lots of whitespace.

        """

        if clean:

            return ''.join(utils.trim_whitespace(page) for page in self)

        else:

            return ''.join(self)

asked Nov 8 at 2:19

RyanWolfe9013

264

add a comment |

up vote
0
down vote

favorite

My slate file for parsing pdf is not working correctly. I get the error attached:

Traceback (most recent call last):

  File "<stdin>", line 2, in <module>

  File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__

    self.doc = PDFDocument(file)

TypeError: __init__() takes at least 2 arguments (1 given)

from StringIO import StringIO



from pdfminer.pdfparser import PDFParser

from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdfinterp import PDFResourceManager

from pdfminer.pdfinterp import PDFPageInterpreter as PI

from pdfminer.pdfdevice import PDFDevice

from pdfminer.converter import TextConverter



import utils



__all__ = ['PDF']



class PDFPageInterpreter(PI):

    def process_page(self, page):

        if 1 <= self.debug:

            print >>stderr, 'Processing page: %r' % page

        (x0,y0,x1,y1) = page.mediabox

        if page.rotate == 90:

            ctm = (0,-1,1,0, -y0,x1)

        elif page.rotate == 180:

            ctm = (-1,0,0,-1, x1,y1)

        elif page.rotate == 270:

            ctm = (0,1,-1,0, y1,-x0)

        else:

            ctm = (1,0,0,1, -x0,-y0)

        self.device.outfp.seek(0)

        self.device.outfp.buf = ''

        self.device.begin_page(page, ctm)

        self.render_contents(page.resources, page.contents, ctm=ctm)

        self.device.end_page(page)

        return self.device.outfp.getvalue()



class PDF(list):

    def __init__(self, file, password='', just_text=1):

        self.parser = PDFParser(file)

        self.doc = PDFDocument()

        self.parser.set_document(self.doc)

        self.doc.set_parser(self.parser)

        self.doc.initialize(password)

        if self.doc.is_extractable:

            self.resmgr = PDFResourceManager()

            self.device = TextConverter(self.resmgr, outfp=StringIO())

            self.interpreter = PDFPageInterpreter(

            self.resmgr, self.device)

            for page in self.doc.get_pages():

                self.append(self.interpreter.process_page(page))

            self.metadata = self.doc.info

        if just_text:

            self._cleanup()



    def _cleanup(self):

        """ 

        Frees lots of non-textual information, such as the fonts

        and images and the objects that were needed to parse the

        PDF.

        """

        del self.device

        del self.doc

        del self.parser

        del self.resmgr

        del self.interpreter



    def text(self, clean=True):

        """ 

        Returns the text of the PDF as a single string.

        Options:



          :clean:

            Removes misc cruft, like lots of whitespace.

        """

        if clean:

            return ''.join(utils.trim_whitespace(page) for page in self)

        else:

            return ''.join(self)

asked Nov 8 at 2:19

RyanWolfe9013

264

My slate file for parsing pdf is not working correctly. I get the error attached:

Traceback (most recent call last):

  File "<stdin>", line 2, in <module>

  File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__

    self.doc = PDFDocument(file)

TypeError: __init__() takes at least 2 arguments (1 given)

from StringIO import StringIO



from pdfminer.pdfparser import PDFParser

from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdfinterp import PDFResourceManager

from pdfminer.pdfinterp import PDFPageInterpreter as PI

from pdfminer.pdfdevice import PDFDevice

from pdfminer.converter import TextConverter



import utils



__all__ = ['PDF']



class PDFPageInterpreter(PI):

    def process_page(self, page):

        if 1 <= self.debug:

            print >>stderr, 'Processing page: %r' % page

        (x0,y0,x1,y1) = page.mediabox

        if page.rotate == 90:

            ctm = (0,-1,1,0, -y0,x1)

        elif page.rotate == 180:

            ctm = (-1,0,0,-1, x1,y1)

        elif page.rotate == 270:

            ctm = (0,1,-1,0, y1,-x0)

        else:

            ctm = (1,0,0,1, -x0,-y0)

        self.device.outfp.seek(0)

        self.device.outfp.buf = ''

        self.device.begin_page(page, ctm)

        self.render_contents(page.resources, page.contents, ctm=ctm)

        self.device.end_page(page)

        return self.device.outfp.getvalue()



class PDF(list):

    def __init__(self, file, password='', just_text=1):

        self.parser = PDFParser(file)

        self.doc = PDFDocument()

        self.parser.set_document(self.doc)

        self.doc.set_parser(self.parser)

        self.doc.initialize(password)

        if self.doc.is_extractable:

            self.resmgr = PDFResourceManager()

            self.device = TextConverter(self.resmgr, outfp=StringIO())

            self.interpreter = PDFPageInterpreter(

            self.resmgr, self.device)

            for page in self.doc.get_pages():

                self.append(self.interpreter.process_page(page))

            self.metadata = self.doc.info

        if just_text:

            self._cleanup()



    def _cleanup(self):

        """ 

        Frees lots of non-textual information, such as the fonts

        and images and the objects that were needed to parse the

        PDF.

        """

        del self.device

        del self.doc

        del self.parser

        del self.resmgr

        del self.interpreter



    def text(self, clean=True):

        """ 

        Returns the text of the PDF as a single string.

        Options:



          :clean:

            Removes misc cruft, like lots of whitespace.

        """

        if clean:

            return ''.join(utils.trim_whitespace(page) for page in self)

        else:

            return ''.join(self)

python ubuntu pdf slate

asked Nov 8 at 2:19

RyanWolfe9013

264

asked Nov 8 at 2:19

RyanWolfe9013

264

asked Nov 8 at 2:19

RyanWolfe9013

264

asked Nov 8 at 2:19

RyanWolfe9013

264

asked Nov 8 at 2:19

RyanWolfe9013

264

add a comment |

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53200632%2fslate-file-incorrectly-initializes-the-arguments%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

active

oldest

votes

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

Some of your past answers have not been well-received, and you're in danger of being blocked from answering.

Please pay close attention to the following guidance:

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Wsrtjtyk