slate file incorrectly initializes the arguments
up vote
0
down vote
favorite
My slate file for parsing pdf is not working correctly. I get the error attached:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__
self.doc = PDFDocument(file)
TypeError: __init__() takes at least 2 arguments (1 given)
for this slate file. I had to modify the import pdfminer... to correct from pdfminer.pdfdocument import PDFDocument for a previous issue but can't figure out how to correct this one from analyzing traceback statement... I tried finding the missing error... Any help with how to correct this problem would much appreciated, thanks!
from StringIO import StringIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter as PI
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
import utils
__all__ = ['PDF']
class PDFPageInterpreter(PI):
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
(x0,y0,x1,y1) = page.mediabox
if page.rotate == 90:
ctm = (0,-1,1,0, -y0,x1)
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
elif page.rotate == 270:
ctm = (0,1,-1,0, y1,-x0)
else:
ctm = (1,0,0,1, -x0,-y0)
self.device.outfp.seek(0)
self.device.outfp.buf = ''
self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm)
self.device.end_page(page)
return self.device.outfp.getvalue()
class PDF(list):
def __init__(self, file, password='', just_text=1):
self.parser = PDFParser(file)
self.doc = PDFDocument()
self.parser.set_document(self.doc)
self.doc.set_parser(self.parser)
self.doc.initialize(password)
if self.doc.is_extractable:
self.resmgr = PDFResourceManager()
self.device = TextConverter(self.resmgr, outfp=StringIO())
self.interpreter = PDFPageInterpreter(
self.resmgr, self.device)
for page in self.doc.get_pages():
self.append(self.interpreter.process_page(page))
self.metadata = self.doc.info
if just_text:
self._cleanup()
def _cleanup(self):
"""
Frees lots of non-textual information, such as the fonts
and images and the objects that were needed to parse the
PDF.
"""
del self.device
del self.doc
del self.parser
del self.resmgr
del self.interpreter
def text(self, clean=True):
"""
Returns the text of the PDF as a single string.
Options:
:clean:
Removes misc cruft, like lots of whitespace.
"""
if clean:
return ''.join(utils.trim_whitespace(page) for page in self)
else:
return ''.join(self)
python ubuntu pdf slate
add a comment |
up vote
0
down vote
favorite
My slate file for parsing pdf is not working correctly. I get the error attached:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__
self.doc = PDFDocument(file)
TypeError: __init__() takes at least 2 arguments (1 given)
for this slate file. I had to modify the import pdfminer... to correct from pdfminer.pdfdocument import PDFDocument for a previous issue but can't figure out how to correct this one from analyzing traceback statement... I tried finding the missing error... Any help with how to correct this problem would much appreciated, thanks!
from StringIO import StringIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter as PI
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
import utils
__all__ = ['PDF']
class PDFPageInterpreter(PI):
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
(x0,y0,x1,y1) = page.mediabox
if page.rotate == 90:
ctm = (0,-1,1,0, -y0,x1)
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
elif page.rotate == 270:
ctm = (0,1,-1,0, y1,-x0)
else:
ctm = (1,0,0,1, -x0,-y0)
self.device.outfp.seek(0)
self.device.outfp.buf = ''
self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm)
self.device.end_page(page)
return self.device.outfp.getvalue()
class PDF(list):
def __init__(self, file, password='', just_text=1):
self.parser = PDFParser(file)
self.doc = PDFDocument()
self.parser.set_document(self.doc)
self.doc.set_parser(self.parser)
self.doc.initialize(password)
if self.doc.is_extractable:
self.resmgr = PDFResourceManager()
self.device = TextConverter(self.resmgr, outfp=StringIO())
self.interpreter = PDFPageInterpreter(
self.resmgr, self.device)
for page in self.doc.get_pages():
self.append(self.interpreter.process_page(page))
self.metadata = self.doc.info
if just_text:
self._cleanup()
def _cleanup(self):
"""
Frees lots of non-textual information, such as the fonts
and images and the objects that were needed to parse the
PDF.
"""
del self.device
del self.doc
del self.parser
del self.resmgr
del self.interpreter
def text(self, clean=True):
"""
Returns the text of the PDF as a single string.
Options:
:clean:
Removes misc cruft, like lots of whitespace.
"""
if clean:
return ''.join(utils.trim_whitespace(page) for page in self)
else:
return ''.join(self)
python ubuntu pdf slate
add a comment |
up vote
0
down vote
favorite
up vote
0
down vote
favorite
My slate file for parsing pdf is not working correctly. I get the error attached:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__
self.doc = PDFDocument(file)
TypeError: __init__() takes at least 2 arguments (1 given)
for this slate file. I had to modify the import pdfminer... to correct from pdfminer.pdfdocument import PDFDocument for a previous issue but can't figure out how to correct this one from analyzing traceback statement... I tried finding the missing error... Any help with how to correct this problem would much appreciated, thanks!
from StringIO import StringIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter as PI
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
import utils
__all__ = ['PDF']
class PDFPageInterpreter(PI):
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
(x0,y0,x1,y1) = page.mediabox
if page.rotate == 90:
ctm = (0,-1,1,0, -y0,x1)
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
elif page.rotate == 270:
ctm = (0,1,-1,0, y1,-x0)
else:
ctm = (1,0,0,1, -x0,-y0)
self.device.outfp.seek(0)
self.device.outfp.buf = ''
self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm)
self.device.end_page(page)
return self.device.outfp.getvalue()
class PDF(list):
def __init__(self, file, password='', just_text=1):
self.parser = PDFParser(file)
self.doc = PDFDocument()
self.parser.set_document(self.doc)
self.doc.set_parser(self.parser)
self.doc.initialize(password)
if self.doc.is_extractable:
self.resmgr = PDFResourceManager()
self.device = TextConverter(self.resmgr, outfp=StringIO())
self.interpreter = PDFPageInterpreter(
self.resmgr, self.device)
for page in self.doc.get_pages():
self.append(self.interpreter.process_page(page))
self.metadata = self.doc.info
if just_text:
self._cleanup()
def _cleanup(self):
"""
Frees lots of non-textual information, such as the fonts
and images and the objects that were needed to parse the
PDF.
"""
del self.device
del self.doc
del self.parser
del self.resmgr
del self.interpreter
def text(self, clean=True):
"""
Returns the text of the PDF as a single string.
Options:
:clean:
Removes misc cruft, like lots of whitespace.
"""
if clean:
return ''.join(utils.trim_whitespace(page) for page in self)
else:
return ''.join(self)
python ubuntu pdf slate
My slate file for parsing pdf is not working correctly. I get the error attached:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/home/ryan/.local/lib/python2.7/site-packages/slate/slate.py", line 37, in __init__
self.doc = PDFDocument(file)
TypeError: __init__() takes at least 2 arguments (1 given)
for this slate file. I had to modify the import pdfminer... to correct from pdfminer.pdfdocument import PDFDocument for a previous issue but can't figure out how to correct this one from analyzing traceback statement... I tried finding the missing error... Any help with how to correct this problem would much appreciated, thanks!
from StringIO import StringIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter as PI
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
import utils
__all__ = ['PDF']
class PDFPageInterpreter(PI):
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
(x0,y0,x1,y1) = page.mediabox
if page.rotate == 90:
ctm = (0,-1,1,0, -y0,x1)
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
elif page.rotate == 270:
ctm = (0,1,-1,0, y1,-x0)
else:
ctm = (1,0,0,1, -x0,-y0)
self.device.outfp.seek(0)
self.device.outfp.buf = ''
self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm)
self.device.end_page(page)
return self.device.outfp.getvalue()
class PDF(list):
def __init__(self, file, password='', just_text=1):
self.parser = PDFParser(file)
self.doc = PDFDocument()
self.parser.set_document(self.doc)
self.doc.set_parser(self.parser)
self.doc.initialize(password)
if self.doc.is_extractable:
self.resmgr = PDFResourceManager()
self.device = TextConverter(self.resmgr, outfp=StringIO())
self.interpreter = PDFPageInterpreter(
self.resmgr, self.device)
for page in self.doc.get_pages():
self.append(self.interpreter.process_page(page))
self.metadata = self.doc.info
if just_text:
self._cleanup()
def _cleanup(self):
"""
Frees lots of non-textual information, such as the fonts
and images and the objects that were needed to parse the
PDF.
"""
del self.device
del self.doc
del self.parser
del self.resmgr
del self.interpreter
def text(self, clean=True):
"""
Returns the text of the PDF as a single string.
Options:
:clean:
Removes misc cruft, like lots of whitespace.
"""
if clean:
return ''.join(utils.trim_whitespace(page) for page in self)
else:
return ''.join(self)
python ubuntu pdf slate
python ubuntu pdf slate
asked Nov 8 at 2:19
RyanWolfe9013
264
264
add a comment |
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53200632%2fslate-file-incorrectly-initializes-the-arguments%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown