[Mayan EDMS: 176] Xpdf patches

classic Classic list List threaded Threaded
5 messages Options
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[Mayan EDMS: 176] Xpdf patches

Brian E
Hi,

Trying out Mayan for a document management server and have been very
impressed so far.  However, we ran into some problems with pdf parsing
and I modified a few routines to use Xpdf tools, which seem to be much
faster than the graphicsmagick backend.

This probably breaks for non pdf files, but it would be a nice
addition for pdf parsing:

apps/converter/backends/graphicsmagick/base.py


 def get_page_count(self, input_filepath):
        command = []
        command.append('pdfinfo')
        command.append(unicode(input_filepath))
        proc = subprocess.Popen(command, close_fds=True,
stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        return_code = proc.wait()
        if return_code != 0:
            print proc.stderr.readline()

        output = proc.stdout.read().splitlines()

        numpages = -1
        for line in output:
            matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I)

            if matchObj:
                numpages = matchObj.group(1)
                break
            else:
                print "No match!!"

        if numpages < 0:
            raise UnknownFileFormat
        else:
            return int(numpages)


PDF text parsing: /apps/ocr/parsers/__init__.py

def pdf_parser(document_page, descriptor=None):
    """Extract the text of one PDF page with ``pdftotext`` and store it.

    Runs ``pdftotext -f N -l N <path> -`` where N is the page number of
    *document_page*, reads the text from the child's standard output and,
    unless it looks like garbage, saves it on *document_page* as ``content``
    together with a page label.

    ``descriptor`` is accepted but not used by this function.

    Raises ParserError when ``pdftotext`` exits with a non-zero status, or
    when the output holds more non-alphabetic, non-whitespace characters
    than alphabetic ones (a rough guess that the page is a scanned image).
    """
    logger.debug('parsing PDF')
    pagenum = str(document_page.page_number)
    logger.debug('parsing PDF page %s', pagenum)

    command = [
        'pdftotext',
        '-f', pagenum,
        '-l', pagenum,
        unicode(document_page.document_version.file.path),
        '-',
    ]

    proc = subprocess.Popen(
        command, close_fds=True,
        stderr=subprocess.PIPE, stdout=subprocess.PIPE
    )
    # communicate() drains both pipes while waiting for the child; calling
    # wait() first and read() afterwards can deadlock once the extracted
    # text is larger than the pipe buffer.
    output, error_output = proc.communicate()
    if proc.returncode != 0:
        logger.error('pdftotext failed: %s', error_output.strip())
        raise ParserError

    # Count characters inline rather than through a module-level helper so
    # this function does not depend on a name defined elsewhere.
    numalpha = sum(1 for char in output if char.isalpha())
    numother = sum(
        1 for char in output
        if not char.isalpha() and not char.isspace()
    )

    logger.debug('Numalpha = %d  Numother = %d', numalpha, numother)

    # Heuristic only: mostly non-letter output is taken to mean the page
    # has no real text layer (e.g. a scanned PDF).
    if numother > numalpha:
        logger.debug('parser error... probably scanned pdf.')
        raise ParserError

    document_page.content = output
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()


Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[Mayan EDMS: 178] Re: Xpdf patches

rosarior
Administrator
Excellent!  I will find a way to get this running alongside the existing code.  Thanks for your contribution!

On Sunday, May 13, 2012 3:29:31 AM UTC-4, Brian E wrote:
Hi,

Trying out Mayan for a document management server and have been very
impressed so far.  However, we ran into some problems with pdf parsing
and I modified a few routines to use Xpdf tools which seems to be much
faster than the graphicsmagick backend.

This probably breaks for non pdf files, but it would be a nice
addition for pdf parsing:

apps/converter/backends/graphicsmagick/base.py


 def get_page_count(self, input_filepath):
        command = []
        command.append('pdfinfo')
        command.append(unicode(input_filepath))
        proc = subprocess.Popen(command, close_fds=True,
stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        return_code = proc.wait()
        if return_code != 0:
            print proc.stderr.readline()

        output = proc.stdout.read().splitlines()

        numpages = -1
        for line in output:
            matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I)

            if matchObj:
                numpages = matchObj.group(1)
                break
            else:
                print "No match!!"

        if numpages < 0:
            raise UnknownFileFormat
        else:
            return int(numpages)


PDF text parsing: /apps/ocr/parsers/__init__.py

def pdf_parser(document_page, descriptor=None):

    logger.debug('parsing PDF')
    pagenum = str(document_page.page_number)

    logger.debug('parsing PDF page %s' % pagenum)

    command = []
    command.append('pdftotext')
    command.append('-f')
    command.append(pagenum)
    command.append('-l')
    command.append(pagenum)
    command.append(unicode(document_page.document_version.file.path))
    command.append('-')

    proc = subprocess.Popen(command, close_fds=True,
stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
    if return_code != 0:
        print proc.stderr.readline()
        raise ParserError

    output = proc.stdout.read()
    numalpha = len( filter(str.isalpha, output) )
    numother = len( filter(notalphaorspace, output) )

    logger.debug("Numalpha = %d  Numother = %d" % (numalpha,
numother))

    if numother > numalpha:
        logger.debug("parser error... probably scanned pdf.")
        raise ParserError

    document_page.content = output
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()


Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

[Mayan EDMS: 198] Re: Xpdf patches

rosarior
Administrator
Hi Brian,

I started integrating your patch and hit an error with this: "global name 'notalphaorspace' is not defined" in this line "numother = len( filter(notalphaorspace, output) ) "  What is notalphaorspace meant to do?

Thanks.


On Sunday, May 13, 2012 1:57:30 PM UTC-4, Roberto Rosario wrote:
Excellent!  I will find a way to get this running along side the existing code.  Thanks for you contribution!

On Sunday, May 13, 2012 3:29:31 AM UTC-4, Brian E wrote:
Hi,

Trying out Mayan for a document management server and have been very
impressed so far.  However, we ran into some problems with pdf parsing
and I modified a few routines to use Xpdf tools which seems to be much
faster than the graphicsmagick backend.

This probably breaks for non pdf files, but it would be a nice
addition for pdf parsing:

apps/converter/backends/graphicsmagick/base.py


 def get_page_count(self, input_filepath):
        command = []
        command.append('pdfinfo')
        command.append(unicode(input_filepath))
        proc = subprocess.Popen(command, close_fds=True,
stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        return_code = proc.wait()
        if return_code != 0:
            print proc.stderr.readline()

        output = proc.stdout.read().splitlines()

        numpages = -1
        for line in output:
            matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I)

            if matchObj:
                numpages = matchObj.group(1)
                break
            else:
                print "No match!!"

        if numpages < 0:
            raise UnknownFileFormat
        else:
            return int(numpages)


PDF text parsing: /apps/ocr/parsers/__init__.py

def pdf_parser(document_page, descriptor=None):

    logger.debug('parsing PDF')
    pagenum = str(document_page.page_number)

    logger.debug('parsing PDF page %s' % pagenum)

    command = []
    command.append('pdftotext')
    command.append('-f')
    command.append(pagenum)
    command.append('-l')
    command.append(pagenum)
    command.append(unicode(document_page.document_version.file.path))
    command.append('-')

    proc = subprocess.Popen(command, close_fds=True,
stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
    if return_code != 0:
        print proc.stderr.readline()
        raise ParserError

    output = proc.stdout.read()
    numalpha = len( filter(str.isalpha, output) )
    numother = len( filter(notalphaorspace, output) )

    logger.debug("Numalpha = %d  Numother = %d" % (numalpha,
numother))

    if numother > numalpha:
        logger.debug("parser error... probably scanned pdf.")
        raise ParserError

    document_page.content = output
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()


Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: [Mayan EDMS: 198] Re: Xpdf patches

Brian E
Hi Roberto...

Forgot that part.  It just returns true if the character isn't alpha
or whitespace


def notalphaorspace(s):
    """Return True when *s* is neither alphabetic nor whitespace.

    Intended for single characters, e.g. as a ``filter`` predicate over a
    string, but it applies ``isalpha``/``isspace`` to whatever string it
    is given.
    """
    return not (s.isalpha() or s.isspace())


You may want to cut that entire bit of code out where it counts
characters.  It's just a lame attempt at detecting scanned PDFs that
doesn't seem to work.  The idea was that if there were more alpha
characters than non alpha then the text parsed ok and it wasn't a
scanned pdf.

brian.

On Wed, May 30, 2012 at 12:39 PM, Roberto Rosario
<[hidden email]> wrote:

> Hi Brian,
>
> I started integrating your patch and hit an error with this: "global name
> 'notalphaorspace' is not defined" in this line "numother = len(
> filter(notalphaorspace, output) ) "  What is notalphaorspace meant to do?
>
> Thanks.
>
>
> On Sunday, May 13, 2012 1:57:30 PM UTC-4, Roberto Rosario wrote:
>>
>> Excellent!  I will find a way to get this running along side the existing
>> code.  Thanks for you contribution!
>>
>> On Sunday, May 13, 2012 3:29:31 AM UTC-4, Brian E wrote:
>>>
>>> Hi,
>>>
>>> Trying out Mayan for a document management server and have been very
>>> impressed so far.  However, we ran into some problems with pdf parsing
>>> and I modified a few routines to use Xpdf tools which seems to be much
>>> faster than the graphicsmagick backend.
>>>
>>> This probably breaks for non pdf files, but it would be a nice
>>> addition for pdf parsing:
>>>
>>> apps/converter/backends/graphicsmagick/base.py
>>>
>>>
>>>  def get_page_count(self, input_filepath):
>>>         command = []
>>>         command.append('pdfinfo')
>>>         command.append(unicode(input_filepath))
>>>         proc = subprocess.Popen(command, close_fds=True,
>>> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
>>>         return_code = proc.wait()
>>>         if return_code != 0:
>>>             print proc.stderr.readline()
>>>
>>>         output = proc.stdout.read().splitlines()
>>>
>>>         numpages = -1
>>>         for line in output:
>>>             matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I)
>>>
>>>             if matchObj:
>>>                 numpages = matchObj.group(1)
>>>                 break
>>>             else:
>>>                 print "No match!!"
>>>
>>>         if numpages < 0:
>>>             raise UnknownFileFormat
>>>         else:
>>>             return int(numpages)
>>>
>>>
>>> PDF text parsing: /apps/ocr/parsers/__init__.py
>>>
>>> def pdf_parser(document_page, descriptor=None):
>>>
>>>     logger.debug('parsing PDF')
>>>     pagenum = str(document_page.page_number)
>>>
>>>     logger.debug('parsing PDF page %s' % pagenum)
>>>
>>>     command = []
>>>     command.append('pdftotext')
>>>     command.append('-f')
>>>     command.append(pagenum)
>>>     command.append('-l')
>>>     command.append(pagenum)
>>>     command.append(unicode(document_page.document_version.file.path))
>>>     command.append('-')
>>>
>>>     proc = subprocess.Popen(command, close_fds=True,
>>> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
>>>     return_code = proc.wait()
>>>     if return_code != 0:
>>>         print proc.stderr.readline()
>>>         raise ParserError
>>>
>>>     output = proc.stdout.read()
>>>     numalpha = len( filter(str.isalpha, output) )
>>>     numother = len( filter(notalphaorspace, output) )
>>>
>>>     logger.debug("Numalpha = %d  Numother = %d" % (numalpha,
>>> numother))
>>>
>>>     if numother > numalpha:
>>>         logger.debug("parser error... probably scanned pdf.")
>>>         raise ParserError
>>>
>>>     document_page.content = output
>>>     document_page.page_label = _(u'Text extracted from PDF')
>>>     document_page.save()
>>>
>>>
>
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: [Mayan EDMS: 200] Re: Xpdf patches

rosarior
Administrator
Ok, thanks for the clarification :)  The parser is in the code: https://github.com/rosarior/mayan/blob/hotfix/v0.12.1/apps/ocr/parsers/__init__.py#L114 hope to release v0.12.1 by next week if everything tests ok.

Roberto

On Wednesday, May 30, 2012 3:45:34 PM UTC-4, Brian E wrote:
Hi Roberto...

Forgot that part.  It just returns true if the character isn't alpha
or whitespace


def notalphaorspace(s):
    if not s.isalpha() and not s.isspace():
        return True
    else:
        return False


You may want to cut that entire bit of code out where it counts
characters.  It's just a lame attempt at detecting scanned PDFs that
doesn't seem to work.  The idea was that if there were more alpha
characters than non alpha then the text parsed ok and it wasn't a
scanned pdf.

brian.

On Wed, May 30, 2012 at 12:39 PM, Roberto Rosario
<roberto...@gmail.com> wrote:

> Hi Brian,
>
> I started integrating your patch and hit an error with this: "global name
> 'notalphaorspace' is not defined" in this line "numother = len(
> filter(notalphaorspace, output) ) "  What is notalphaorspace meant to do?
>
> Thanks.
>
>
> On Sunday, May 13, 2012 1:57:30 PM UTC-4, Roberto Rosario wrote:
>>
>> Excellent!  I will find a way to get this running along side the existing
>> code.  Thanks for you contribution!
>>
>> On Sunday, May 13, 2012 3:29:31 AM UTC-4, Brian E wrote:
>>>
>>> Hi,
>>>
>>> Trying out Mayan for a document management server and have been very
>>> impressed so far.  However, we ran into some problems with pdf parsing
>>> and I modified a few routines to use Xpdf tools which seems to be much
>>> faster than the graphicsmagick backend.
>>>
>>> This probably breaks for non pdf files, but it would be a nice
>>> addition for pdf parsing:
>>>
>>> apps/converter/backends/graphicsmagick/base.py
>>>
>>>
>>>  def get_page_count(self, input_filepath):
>>>         command = []
>>>         command.append('pdfinfo')
>>>         command.append(unicode(input_filepath))
>>>         proc = subprocess.Popen(command, close_fds=True,
>>> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
>>>         return_code = proc.wait()
>>>         if return_code != 0:
>>>             print proc.stderr.readline()
>>>
>>>         output = proc.stdout.read().splitlines()
>>>
>>>         numpages = -1
>>>         for line in output:
>>>             matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I)
>>>
>>>             if matchObj:
>>>                 numpages = matchObj.group(1)
>>>                 break
>>>             else:
>>>                 print "No match!!"
>>>
>>>         if numpages < 0:
>>>             raise UnknownFileFormat
>>>         else:
>>>             return int(numpages)
>>>
>>>
>>> PDF text parsing: /apps/ocr/parsers/__init__.py
>>>
>>> def pdf_parser(document_page, descriptor=None):
>>>
>>>     logger.debug('parsing PDF')
>>>     pagenum = str(document_page.page_number)
>>>
>>>     logger.debug('parsing PDF page %s' % pagenum)
>>>
>>>     command = []
>>>     command.append('pdftotext')
>>>     command.append('-f')
>>>     command.append(pagenum)
>>>     command.append('-l')
>>>     command.append(pagenum)
>>>     command.append(unicode(document_page.document_version.file.path))
>>>     command.append('-')
>>>
>>>     proc = subprocess.Popen(command, close_fds=True,
>>> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
>>>     return_code = proc.wait()
>>>     if return_code != 0:
>>>         print proc.stderr.readline()
>>>         raise ParserError
>>>
>>>     output = proc.stdout.read()
>>>     numalpha = len( filter(str.isalpha, output) )
>>>     numother = len( filter(notalphaorspace, output) )
>>>
>>>     logger.debug("Numalpha = %d  Numother = %d" % (numalpha,
>>> numother))
>>>
>>>     if numother > numalpha:
>>>         logger.debug("parser error... probably scanned pdf.")
>>>         raise ParserError
>>>
>>>     document_page.content = output
>>>     document_page.page_label = _(u'Text extracted from PDF')
>>>     document_page.save()
>>>
>>>
>
Loading...