<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>mFabrik - mobile sites, apps, HTML5 and CMS software development &#187; ocr</title>
	<atom:link href="http://blog.mfabrik.com/tag/ocr/feed/" rel="self" type="application/rss+xml" />
	<link>http://blog.mfabrik.com</link>
	<description>Freedom delivered.</description>
	<lastBuildDate>Wed, 03 Aug 2011 09:47:41 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.2.1</generator>
		<item>
		<title>Viivi &amp; Wagner strip scraper</title>
		<link>http://blog.mfabrik.com/2008/05/07/viivi-wagner-strip-scaper/</link>
		<comments>http://blog.mfabrik.com/2008/05/07/viivi-wagner-strip-scaper/#comments</comments>
		<pubDate>Tue, 06 May 2008 22:33:22 +0000</pubDate>
		<dc:creator>Mikko Ohtamaa</dc:creator>
				<category><![CDATA[python]]></category>
		<category><![CDATA[beuatifulsoap]]></category>
		<category><![CDATA[challenge]]></category>
		<category><![CDATA[comic]]></category>
		<category><![CDATA[downloader]]></category>
		<category><![CDATA[ocr]]></category>
		<category><![CDATA[programming language]]></category>
		<category><![CDATA[scaper]]></category>
		<category><![CDATA[spider]]></category>
		<category><![CDATA[strip]]></category>
		<category><![CDATA[viivi ja wagner]]></category>
		<category><![CDATA[web]]></category>

		<guid isPermaLink="false">http://blog.redinnovation.com/?p=40</guid>
		<description><![CDATA[I wrote this little script as a mental exercise and to prove the power of Python programming language. If anyone accepts the challenge, I&#8217;d like to see submissions in other programming languages. For the foreigners: this is the best comic in Finland, so I hope you&#8217;ll get translations soon! It tells about the relationship of [...]]]></description>
			<content:encoded><![CDATA[<p>I wrote this little script as a mental exercise and to prove the power of Python programming language. If anyone accepts the challenge, I&#8217;d like to see submissions in other programming languages <img src='http://blog.mfabrik.com/wp-includes/images/smilies/icon_wink.gif' alt=';)' class='wp-smiley' /> </p>
<p>For the foreigners: this is the best comic in Finland, so I hope you&#8217;ll get translations soon! It tells about the relationship of a woman and a pig (sic) reflecting the deepest shadows of Finnish social life.</p>
<pre>"""
	Creates a local mirror of Viivi &amp; Wagner strips by fetching all of them from hs.fi.

	Will create downloaded strips as
		2004/1.1.2004.gif
		2004/2.1.2004.gif
		...
		until today

	Try this in C++!

	Motivation: No one has built a Viivi &amp; Wagner search engine with speech bubble OCR support
	and I desperately wanted to find "Kottarainen lentaa korvaan" strip for my gf.

	Time to complete: 20 min.

"""

__docformat__ = "epytext"
__author__ = "Mikko Ohtamaa"
__license__ = "BSD"
__copyright__ = "2008 Mikko Ohtamaa"

import os
import re
import urllib
from BeautifulSoup import BeautifulSoup

# 1.1.2004 start page
url = "http://www.hs.fi/viivijawagner/1073386660690"

# Main loop (Python 2): download one strip page per iteration, save its image
# as YEAR/D.M.YEAR.gif under the current directory, then follow the page's
# "seuraava" (next) link. Stops when a page has no such link.
while True:
	# Fetch the page; try/finally closes the stream even if read() fails
	stream = urllib.urlopen(url)
	try:
		html = stream.read()
	finally:
		stream.close()
	soup = BeautifulSoup(html)

	# The strip date is located relative to the "Viivi ja Wagner" title text
	h1 = soup.findAll(text="Viivi ja Wagner")
	if not h1:
		# Fail with the offending URL instead of a bare IndexError
		raise RuntimeError("No strip title found on " + url)
	date = h1[0].parent.parent.p.string

	# A parenthesised single argument prints the same under the Python 2
	# print statement
	print("Fetching " + date)

	# Scrape strip: the image is the img inside the first div of class "strip"
	strip = soup.findAll("div", {"class": "strip"})
	if not strip:
		raise RuntimeError("No strip image found on " + url)
	img = strip[0].img

	stream = urllib.urlopen(img["src"])
	try:
		data = stream.read()
	finally:
		stream.close()

	# For each year, give a new folder to avoid file system stress
	# (lotsa files in a folder kill poor Gnome)
	# The date is expected in D.M.YYYY form; only the year part is used here
	day, month, year = date.split(".")
	folder = year

	if not os.path.exists(folder):
		os.mkdir(folder)

	# Store contents; binary mode because the payload is image data
	fname = os.path.join(folder, date + ".gif")
	f = open(fname, "wb")
	try:
		f.write(data)
	finally:
		f.close()

	# Find the next url: it is the href of the element wrapping the img
	# whose alt text is "seuraava"
	next_img = soup.findAll(alt="seuraava")
	if not next_img:
		# No next link on this page, so the walk is finished
		break
	a = next_img[0].parent
	url = a["href"]
</pre>
<h2>See preview</h2>
<p><a href="http://blog.redinnovation.com/wp-content/uploads/2008/05/screenshot-2004-file-browser1.png"><img class="alignnone size-thumbnail wp-image-42" title="screenshot-2004-file-browser1" src="http://blog.redinnovation.com/wp-content/uploads/2008/05/screenshot-2004-file-browser1-150x150.png" alt="" width="150" height="150" /></a><a href="http://blog.redinnovation.com/wp-content/uploads/2008/05/screenshot-2004-file-browser.png"><br />
</a></p>
]]></content:encoded>
			<wfw:commentRss>http://blog.mfabrik.com/2008/05/07/viivi-wagner-strip-scaper/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
		</item>
	</channel>
</rss>

