#!/usr/bin/env python3
# dataLoad.py
import argparse
import json
import os
import random
import subprocess
import sys
import time
from pprint import pprint

import requests
import yaml
from PIL import Image
  17. parser = argparse.ArgumentParser(description="S0urce.io utility program.")
  18. parser.add_argument("--download", help="Download Images", action="store_true")
  19. parser.add_argument("--train", help="Convert Images to Text", action="store_true")
  20. parser.add_argument(
  21. "JSON", type=str, nargs="?", help="Filename to save results", default="test.js"
  22. )
  23. args = parser.parse_args()
  24. # pprint(args)
  25. # Should we add the JSON in a file? (True is filename, False = do not do)
  26. # JSONME = 'test.js'
  27. JSONME = args.JSON
  28. # NOTE: To begin the insert of the JSONIFIED image and word its
  29. # // T
  30. # A JS comment with a uppercase T
  31. # To stop its
  32. # // t
  33. # A JS comment with a lowercase t
  34. # httpbin.org/headers
  35. sess = requests.Session()
  36. head = {
  37. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
  38. }
  39. sess.headers.update(head)
  40. ON = "X" # Dark pixel in an image
  41. OFF = "." # Light pixel in an image
  42. DIR = (
  43. "data"
  44. ) # Data directory name, do we really need this? Is it really going to change?
  45. INTENSITY = (
  46. 75
  47. ) # How bright does something have to be to trigger it being a dark or light pixel?
  48. # Looks like around 75 removes the extra stuff that s0urce.io does to prevent it from being just matching images.
  49. GREEN_DIFF = 10
  50. # How much brighter the green channel must be (compared to the others),
  51. # to be called green.
  52. # Check the environment, do we have all that we need?
  53. if not os.path.exists("images"):
  54. os.mkdir("images")
  55. if not os.path.exists("data"):
  56. os.mkdir("data")
  57. if not os.path.exists("words.yml"):
  58. with open('words.yml', 'w') as f: # Create a empty yaml file
  59. f.write('')
  60. def download(howhard, index):
  61. global sess
  62. """
  63. Download an image based upon how hard it is.
  64. On success, it saves the image file.
  65. Failure raises ConnectionError.
  66. """
  67. r = sess.get(f"http://s0urce.io/client/img/word/{howhard}/{index}")
  68. if r.status_code == 200:
  69. with open(f"images/{howhard}_{index}.png", "wb") as f:
  70. f.write(r.content)
  71. else:
  72. # We did not get a 200 Okay, log this... Hmm maybe we need to make a log file?
  73. # print( f'{howhard}_{index}.png ' + str(r.status_code) )
  74. raise ConnectionError(
  75. "http://s0urce.io/client/img/word/{0}/{1} returned status_code {2}".format(
  76. howhard, index, r.status_code
  77. )
  78. )
  79. def img_point(pix, x, y):
  80. """
  81. img_point, returns a pixel of an image,
  82. given the x and y on the image.
  83. """
  84. return pix[x, y]
  85. def img_avg(pix, x, y):
  86. """
  87. img_avg, returns the average brightness 0-255,
  88. given pixel, and the x and y on the image calls img_point,
  89. to get the individual rgb values to calculate,
  90. brightness. (Grey scale)
  91. """
  92. rgb = img_point(pix, x, y)
  93. # if(im.mode == 'P'):
  94. # rgb = pal[rgb*3:(rgb+1)*3]
  95. # if(im.mode == 'I'):
  96. # return rgb >> 8
  97. return int((rgb[0] + rgb[1] + rgb[2]) / 3)
  98. def is_set(pix, x, y):
  99. global INTENSITY
  100. """
  101. is_set, returns True or False of calculating,
  102. the brightness of the given point on a image,
  103. compared to given intensity.
  104. True means the brightness at the given x and y,
  105. is Less Than which means its dark.
  106. False means the brightness at the given x and y,
  107. is Greater Than which means its bright. (Grey Scale)
  108. """
  109. avg = img_avg(pix, x, y)
  110. return avg < INTENSITY
  111. def is_green(pix, x, y):
  112. """
  113. Is this pixel Green?
  114. """
  115. (red, green, blue, _) = img_point(pix, x, y)
  116. # Find the difference between green and the other values.
  117. other = red
  118. if blue > other:
  119. other = blue
  120. diff = green - other
  121. return diff > GREEN_DIFF
  122. def scan_img(pix, size):
  123. """
  124. scan_img, looks at a image and looks for dark pixels,
  125. if it is a dark pixel record the number and resize the,
  126. returned values to show where the most dark pixels on the,
  127. image are located. (Grey Scale)
  128. given pixel, and image size.
  129. returns start x, y and end x, y and total number of dark pixels.
  130. """
  131. total = 0
  132. sx = size[0]
  133. ex = 0
  134. sy = size[1]
  135. ey = 0
  136. for y in range(0, size[1]):
  137. for x in range(0, size[0]):
  138. pnt_is = is_set(pix, x, y)
  139. if pnt_is:
  140. total += 1
  141. if x < sx:
  142. sx = x
  143. if x > ex:
  144. ex = x
  145. if y < sy:
  146. sy = y
  147. if y > ey:
  148. ey = y
  149. # print (sx,ex,sy,ey)
  150. # give us a little border to work with
  151. if sx > 0:
  152. sx -= 1
  153. if ex < size[0]:
  154. ex += 1
  155. if sy > 0:
  156. sy -= 1
  157. if ey < size[1]:
  158. ey += 1
  159. # print (sx,ex,sy,ey)
  160. return (sx, sy, ex, ey, total)
  161. def output_image(pix, size):
  162. """
  163. For the size of the area we have reduced down to where the majority of dark pixels,
  164. are located, store all that into a list and return the list.
  165. given pixel for function passing.
  166. returns multiple strings in a list that are edited to use characters to represent,
  167. the dark and light pixels of the image. (Grey Scale)
  168. """
  169. result = []
  170. ex = size[0]
  171. sx = 0
  172. ey = size[1]
  173. sy = 0
  174. for y in range(sy, ey):
  175. s = ""
  176. for x in range(sx, ex):
  177. # if is_set(pix, x, y):
  178. if not is_green(pix, x, y):
  179. s += ON
  180. else:
  181. s += OFF
  182. result.append(s)
  183. return result
  184. def image_filename(difficulty, index):
  185. return f"images/{difficulty}_{index}.png"
  186. def cleaned_filename(difficulty, index):
  187. return f"images/{difficulty}_{index}_clean.png"
  188. def image_cleaner(source, destination):
  189. image = Image.open(source)
  190. # pixels = image.load()
  191. size = image.size
  192. print(f"Size: {size[0]} x {size[1]}")
  193. for y in range(0, size[1]):
  194. s = ""
  195. for x in range(0, size[0]):
  196. (r, g, b, _) = image.getpixel( (x,y) )
  197. high = r
  198. if b > high:
  199. high = b
  200. diff = g - high
  201. is_green = diff > GREEN_DIFF
  202. if is_green:
  203. image.putpixel( (x,y), (255,255,255,255) )
  204. else:
  205. image.putpixel( (x,y), (0, 0, 0, 255) )
  206. # if is_set(pix, x, y):
  207. # if not is_green(pix, x, y):
  208. # result.append(s)
  209. image.save(destination)
  210. def run(difficult, index):
  211. """
  212. run, represents a single execution of components to the image, (Actuall we do it 1 category at a time instead of just 1 single execution )
  213. those components do the following... (Each category has around 70 items so we standardize on 70, but )
  214. (not all of the categories have 70 and thus we print a File does not exist)
  215. We open and load the image, and get it's size,
  216. then we scan_img for dark and light pixels, <-- This narrows the image down to just the majority of dark pixels
  217. then from that we output the image line by line onto the screen after it has been output_image d into list form,
  218. Where we ask the user what the word is, and after that we save all that to a file in the data directory.
  219. """
  220. <<<<<<< Updated upstream
  221. fname = f"images/{difficult}_{x}.png"
  222. if not os.path.exists(fname):
  223. print("Could not find '{0}'".format(fname))
  224. return False # We did not complete
  225. =======
  226. for x in range(0, 70):
  227. fname = image_filename(difficult, x)
  228. if not os.path.exists(fname):
  229. # print("Could not find '{0}'".format(fname))
  230. # continue
  231. # We've reached the end, so stop looking. :P
  232. break
  233. >>>>>>> Stashed changes
  234. print(f"Loading: {fname}")
  235. im = Image.open(fname)
  236. pix = im.load()
  237. size = im.size
  238. print(f"Size: {size[0]} x {size[1]}")
  239. pal = im.getpalette()
  240. sx = 0
  241. ex = size[0]
  242. sy = 0
  243. ey = size[1]
  244. total = 0
  245. sx, sy, ex, ey, total = scan_img(pix, size)
  246. print(f"Chars within ({sx}, {sy}) - ({ex}, {ey}) total {total} pixels")
  247. img_s = output_image(pix, size)
  248. for l in img_s:
  249. print(l)
  250. word = input("Word: ")
  251. # Returns word so it can be stored in dictonary
  252. return word
  253. #print(f"Image saved to '{DIR}/{difficult}_{x}.txt' in byte string")
  254. # os.remove(f'{fname}') # Grr No bad bean, keep file for error checking
  255. # print(f"File '{fname}' automatically removed")
  256. <<<<<<< Updated upstream
  257. def autotrain(difficult, index):
  258. =======
  259. key_word = {}
  260. def autotrain(difficult):
  261. >>>>>>> Stashed changes
  262. """
  263. run, represents a single execution of components to the image, (Actuall we do it 1 category at a time instead of just 1 single execution )
  264. those components do the following... (Each category has around 70 items so we standardize on 70, but )
  265. (not all of the categories have 70 and thus we print a File does not exist)
  266. We open and load the image, and get it's size,
  267. then we scan_img for dark and light pixels, <-- This narrows the image down to just the majority of dark pixels
  268. then from that we output the image line by line onto the screen after it has been output_image d into list form,
  269. Where we ask the user what the word is, and after that we save all that to a file in the data directory.
  270. """
  271. <<<<<<< Updated upstream
  272. # Re aranged the code so I can have it return after each word
  273. fname = f"images/{difficult}_{x}.png"
  274. if not os.path.exists(fname):
  275. print("Could not find '{0}'".format(fname))
  276. return False # We did not complete
  277. print(f"Loading: {fname}")
  278. fileout = "data/{0}_{1}".format(difficult, x)
  279. output = subprocess.run(
  280. ["tesseract", fname, fileout],
  281. stderr=subprocess.DEVNULL,
  282. # capture_output=False,
  283. shell=False,
  284. )
  285. with open(fileout + ".txt", "r") as fp:
  286. word = fp.read().strip()
  287. print(word)
  288. return word # Save this to the dict
  289. =======
  290. for x in range(0, 70):
  291. fname = image_filename(difficult, x)
  292. if not os.path.exists(fname):
  293. break
  294. # print("Could not find '{0}'".format(fname))
  295. # continue
  296. cleaned = cleaned_filename(difficult, x)
  297. if not os.path.exists(cleaned):
  298. image_cleaner(fname, cleaned)
  299. print(f"Loading: {cleaned}")
  300. fileout = "data/{0}_{1}".format(difficult, x)
  301. output = subprocess.run(
  302. ["tesseract", cleaned, fileout],
  303. stderr=subprocess.DEVNULL,
  304. # capture_output=False,
  305. shell=False,
  306. )
  307. with open(fileout + ".txt", "r") as fp:
  308. word = fp.read().strip().lower()
  309. key_word[f'{difficult}_{x}'] = word
  310. print(word)
  311. >>>>>>> Stashed changes
  312. # Now to call all the previous functions
  313. if args.download:
  314. print("Downloading s0urce.io Words")
  315. print("EASY")
  316. # time.sleep(5)
  317. for e in range(0, 62):
  318. download("e", e)
  319. # time.sleep(random.randint(10, 15))
  320. print("MEDIUM")
  321. # time.sleep(5)
  322. for m in range(0, 66):
  323. download("m", m)
  324. # time.sleep(random.randint(10, 15))
  325. print("HARD")
  326. # time.sleep(5)
  327. for h in range(0, 55):
  328. download("h", h)
  329. # time.sleep(random.randint(10, 15))
  330. if args.train:
  331. # Img Processing: Run thru every single category and every single word
  332. wordDict = {}
  333. for level in ["e", "m", "h"]:
  334. <<<<<<< Updated upstream
  335. for x in range(0, 66):
  336. at = autotrain(level, x)
  337. if(at != False): # If it is complete store it
  338. wordDict["{0}_{1}".format(level, x)] = at
  339. with open('words.yml', 'w') as f:
  340. yaml.dump(wordDict, f) # Writes it automatically into the file
  341. =======
  342. autotrain(level)
  343. with open(args.JSON, 'w') as fp:
  344. json.dump(key_word, fp, sort_keys=True, indent=2)
  345. >>>>>>> Stashed changes
  346. # ----------------------------------------------------------------------------------------
  347. # All below was in a seperate dataJS.py file... but now I have fixed it so it's 1 script!
  348. # Do we really need to worry about all this right now? (I think we have enough bugs to begin with.)
  349. JSONME = "false" # Do not execute
  350. if JSONME.lower() != "false":
  351. print("Now exporting to JSON")
  352. print(f"Targeting file: '{JSONME}'")
  353. time.sleep(5)
  354. def test(t):
  355. global DIR
  356. """
  357. given the filename, we read it and add it to a list and return the list.
  358. """
  359. fname = f"{DIR}/{t}.txt"
  360. r = []
  361. try:
  362. with open(fname, "r") as f:
  363. for l in f:
  364. r.append(l.strip())
  365. return r
  366. except FileNotFoundError:
  367. return None
  368. def insertJS(item):
  369. global JSON
  370. """
  371. Edits the file given and adds the JSONIFIED item to the file between 2 indicators,
  372. // T
  373. and
  374. // t
  375. In between the T and t will be replaced with the item.
  376. """
  377. item = json.dumps(item)
  378. item = f"{item},"
  379. r = []
  380. try:
  381. with open(f"{JSONME}", "r") as f:
  382. for l in f:
  383. if l != "":
  384. r.append(l.strip("\n"))
  385. else:
  386. r.append("")
  387. except FileNotFoundError:
  388. print(f"File {JSONME} Not Found!")
  389. sys.exit()
  390. c = 0
  391. for e in r:
  392. if "// T" == e:
  393. temp = r[c + 1]
  394. del r[c + 1]
  395. r.insert(c + 1, item)
  396. r.insert(c + 2, temp)
  397. elif "// t" == e:
  398. break
  399. c += 1
  400. with open(f"{JSONME}", "w") as f:
  401. for e in r:
  402. f.write(f"{e}\n")
  403. for x in range(0, 183):
  404. te = test(x)
  405. if te != None:
  406. word = te
  407. insertJS(word)
  408. # Regardless what we did let the user know we at least ran and we are now done
  409. print("Complete")