{"id":75,"date":"2020-03-11T10:00:01","date_gmt":"2020-03-11T01:00:01","guid":{"rendered":"http:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/?p=75"},"modified":"2020-06-24T14:56:01","modified_gmt":"2020-06-24T05:56:01","slug":"%e3%83%86%e3%82%b9%e3%83%88","status":"publish","type":"post","link":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/?p=75","title":{"rendered":"\u6b63\u898f\u8868\u73fe\u3092\u5229\u7528\u3057\u305fOCR\u30c6\u30ad\u30b9\u30c8\u306e\u30af\u30ea\u30fc\u30cb\u30f3\u30b0\u624b\u6cd5"},"content":{"rendered":"\n<hr class=\"wp-block-separator\"\/>\n\n\n\n<p style=\"background-color:#fcf8e3;color:#8a6d3b\" class=\"has-text-color has-background\"><strong>Laura Turner O&#8217;Hara<\/strong><\/p>\n\n\n\n<p style=\"background-color:#fcf8e3;color:#8a6d3b\" class=\"has-text-color has-background has-small-font-size has-medium-font-size\">\u30b9\u30ad\u30e3\u30f3\u753b\u50cf\u3092\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf\u306b\u5909\u63db\u3059\u308b\u5149\u5b66\u7684\u6587\u5b57\u8a8d\u8b58\uff08Optical Character Recognition; OCR\uff09\u306f\u3001\u6b74\u53f2\u7814\u7a76\u306b\u3068\u3063\u3066\u5929\u304b\u3089\u306e\u8d08\u308a\u7269\u3067\u3042\u308b\u3053\u3068\u306f\u660e\u3089\u304b\u3067\u3059\u3002\u3053\u306e\u30ec\u30c3\u30b9\u30f3\u3067\u306f\u3001OCR\u3067\u30c6\u30ad\u30b9\u30c8\u5316\u3055\u308c\u305f\u30c7\u30fc\u30bf\u3092\u3088\u308a\u4f7f\u3044\u3084\u3059\u304f\u3059\u308b\u65b9\u6cd5\u3092\u5b66\u3073\u307e\u3059\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u76ee\u6b21<\/h2>\n\n\n\n<ul><li><a href=\"#introduction\">\u306f\u3058\u3081\u306b<\/a><\/li><li><a href=\"#regular-expressions-regex\">\u6b63\u898f\u8868\u73fe\uff08Regex\uff09<\/a><\/li><li><a href=\"#python-and-regex\">Python\u3068\u6b63\u898f\u8868\u73fe<\/a><ul><li><a href=\"#two-things-to-note-before-you-get-started\">\u59cb\u3081\u308b\u524d\u306b\u899a\u3048\u3066\u304a\u304f\u3079\u304d2\u3064\u306e\u3053\u3068<\/a><\/li><li><a 
href=\"#my-example-python-file\">\u30b5\u30f3\u30d7\u30ebPython\u30d5\u30a1\u30a4\u30eb<\/a><\/li><li><a href=\"#using-verbose-mode\">VERBOSE\u30e2\u30fc\u30c9\u3092\u6d3b\u7528<\/a><\/li><\/ul><\/li><\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"introduction\">\u306f\u3058\u3081\u306b<\/h2>\n\n\n\n<p>\u30b9\u30ad\u30e3\u30f3\u753b\u50cf\u3092\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf\u306b\u5909\u63db\u3059\u308b\u5149\u5b66\u7684\u6587\u5b57\u8a8d\u8b58\uff08Optical Character Recognition; OCR\uff09\u306f\u3001\u6b74\u53f2\u7814\u7a76\u306b\u3068\u3063\u3066\u5929\u304b\u3089\u306e\u8d08\u308a\u7269\u3067\u3042\u308b\u3053\u3068\u306f\u660e\u3089\u304b\u3067\u3059\u3002\u3053\u306e\u51e6\u7406\u306b\u3088\u308a\u3001\u30c6\u30ad\u30b9\u30c8\u3092\u691c\u7d22\u53ef\u80fd\u306a\u3082\u306e\u306b\u3057\u3064\u3064\u3001\u3088\u308a\u7c21\u5358\u306b\u69cb\u6587\u89e3\u6790\u3057\u30c6\u30ad\u30b9\u30c8\u30de\u30a4\u30cb\u30f3\u30b0\u3092\u884c\u3048\u308b\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002\u3057\u304b\u3057\u3001\u53f2\u6599\u306b\u5bfe\u3057\u3066OCR\u306f\u5b8c\u74a7\u3068\u306f\u7a0b\u9060\u3044\u3053\u3068\u306f\u8ab0\u3082\u304c\u6c17\u4ed8\u3044\u3066\u3044\u308b\u306f\u305a\u3067\u3059\u3002\u53e4\u3044\u66f8\u4f53\u3084\u5f62\u5f0f\u306f\u72ec\u7279\u306eOCR\u3092\u5fc5\u8981\u3068\u3057\u307e\u3059\u3002\u4f8b\u3048\u3070\u3001\u7b2c50\u56de\u5408\u8846\u56fd\u9023\u90a6\u8b70\u4f1a\u306e<em>Congressional Directory<\/em>\uff081887\uff09\u306b\u3042\u308b\u6b21\u306e\u30da\u30fc\u30b8\u3092\u898b\u3066\u304f\u3060\u3055\u3044\u3002<a rel=\"noreferrer noopener\" href=\"http:\/\/home.heinonline.org\/\" 
target=\"_blank\">HeinOnline<\/a>\uff3b\u8a33\u6ce8\uff1aHeinOnline\u306f\u6cd5\u5b66\u95a2\u4fc2\u8cc7\u6599\u306e\u6709\u511f\u30aa\u30f3\u30e9\u30a4\u30f3\u30d9\u30fc\u30b9\u3002\u30a2\u30e1\u30ea\u30ab\u306e\u5b98\u5831\u7b49\u3082\u63d0\u4f9b\u3057\u3066\u3044\u308b\u3002\uff3d\u304b\u3089\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3057\u305fPDF\u30b9\u30ad\u30e3\u30f3\u306f\u3001\u307e\u3068\u307e\u3063\u3066\u3044\u308b\u3088\u3046\u306b\u898b\u3048\u307e\u3059:<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter is-resized\"><img decoding=\"async\" loading=\"lazy\" src=\"http:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/wp-content\/uploads\/\u56f31.png\" alt=\"\" class=\"wp-image-199\" width=\"580\" height=\"830\"\/><figcaption>\u3053\u308c\u306fPDF\u30da\u30fc\u30b8\u306e\u30b9\u30af\u30ea\u30fc\u30f3\u30b7\u30e7\u30c3\u30c8\u3067\u3059\u3002 <\/figcaption><\/figure><\/div>\n\n\n\n<p>\u3057\u304b\u3057\u3001OCR\u30ec\u30a4\u30e4\u30fc\uff08\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3068\u3057\u3066\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9*\uff09\u3092\u898b\u308b\u3068\u3001\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf\u304c\u305d\u3053\u307e\u3067\u304d\u308c\u3044\u3067\u306f\u306a\u3044\u3053\u3068\u304c\u5206\u304b\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter is-resized\"><img decoding=\"async\" loading=\"lazy\" src=\"http:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/wp-content\/uploads\/\uff12.png\" alt=\"\" class=\"wp-image-200\" width=\"714\" height=\"902\"\/><figcaption>  \u3053\u308c\u306fOCR\u306e\u30b9\u30af\u30ea\u30fc\u30f3\u30b7\u30e7\u30c3\u30c8\u3067\u3059\u3002 <\/figcaption><\/figure><\/div>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\"><em>\u6ce8: \u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u306e\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u30aa\u30d7\u30b7\u30e7\u30f3\u304c\u306a\u3051\u308c\u3070\u3001<\/em><a rel=\"noreferrer noopener\" 
href=\"http:\/\/www.unixuser.org\/~euske\/python\/pdfminer\/index.html\" target=\"_blank\"><em>pdfminer<\/em><\/a><em>\u30e2\u30b8\u30e5\u30fc\u30eb\u3092\u4f7f\u3048\u3070PDF\u304b\u3089\u30c6\u30ad\u30b9\u30c8\u3092\u62bd\u51fa\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002<\/em><\/p>\n\n\n\n<p>\u3053\u308c\u3092\u4f7f\u3063\u306619\u4e16\u7d00\u5f8c\u534a\u306e\u56fd\u4f1a\u8b70\u54e1\u306e\u30ef\u30b7\u30f3\u30c8\u30f3\u3067\u306e\u4f4f\u6240\u3092\u30de\u30c3\u30d4\u30f3\u30b0\u3057\u305f\u3044\u306e\u3067\u3059\u304c\u3001\u3053\u306e\u30c7\u30fc\u30bf\u3092\u3088\u308a\u6d3b\u7528\u3057\u3084\u3059\u304f\u3059\u308b\u305f\u3081\u306b\u306f\u3069\u3046\u3059\u308c\u3070\u3088\u3044\u3067\u3057\u3087\u3046\u304b\uff1f<\/p>\n\n\n\n<p>\u7b54\u3048\u306f\u6b63\u898f\u8868\u73fe\u3001\u201dregex\u201d\u3067\u3059\u3002\u6b63\u898f\u8868\u73fe\u306b\u3088\u3063\u3066\u6b21\u306e\u3088\u3046\u306a\u7d50\u679c\u304c\u5f97\u3089\u308c\u307e\u3057\u305f\u3002\u3053\u308c\u306f\u300c\u672c\u5f53\u306e\u300dCSV\u30d5\u30a1\u30a4\u30eb\u3067\u306f\u306a\u3044\u3082\u306e\u306e\uff08\u30ab\u30f3\u30de\u304c\u6b63\u3057\u304f\u3042\u308a\u307e\u305b\u3093\uff09\u3001\u30a8\u30af\u30bb\u30eb\u3067\u7c21\u5358\u306b\u78ba\u8a8d\u3067\u304d\u3001\u30b8\u30aa\u30b3\u30fc\u30c7\u30a3\u30f3\u30b0\u306b\u5411\u3051\u3066\u6e96\u5099\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002\u524d\u306e\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3088\u308a\u306f\u305a\u3063\u3068\u826f\u3044\u3067\u3059\u3088\u306d\uff1f<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">\n Aldrich, N. W,Providence, R. I<br>\n Allison, William B, Dubuque, Iowa,24Vermont avenue,<br>\n Bate, William,Nashville, Ten, Ebbitt House<br>\n Beck, James B,Lexington, Ky<br>\n Berry, James I, Bentonville, Ark, National Hotel,<br>\n Blair, I lenry \\V, Manchester, N. H,2o East Capitol stree_._&#8217;<br>\n Blodgett, Rufus,Long Branch, N. 
J<br>\n Bowen, Thomas M,Del Norte, Colo<br>\n Brown, Joseph E, Atlanta, Ga, Woodmont Flats,<br>\n Butler, M. C,Edgefield, S. C, 1751 P street NW<br>\n Call, Wilkinson, Jacksonville, Fla, 1903 N street NW<br>\n Cameron, J. D,Harrisburg, Pa, 21 Lafayette Square,<br>\n Chace, Jonathan,Providence, R, I<br>\n Chandler, William E, Concord, N. H, 1421 I street NW<br>\n Cockrell, Francis M,Warrensburgh,Mo, I518 R street NW<br>\n Coke, Richard,Waco, Tex, 419 Sixth street NW<br>\n Colquitt, Alfred I I,Atlanta, Ga, 920 New York avenue<br>\n Cullom, Shelby M,Springfield, Ill, 1402 Massachusetts avenue<br>\n Daniel, John W,,Lynchburgh, Va, I7OO Nineteenth st. NW<br>\n Davis, Cushman K, Saint Paul, Minn, 17oo Fifteenth street NW<br>\n Dawes, Henry L,Pittsfield, Mass, 1632Rhode Island avenue.<br>\n Dolph, Joseph N,Portland, Oregon, 8 Lafayette Square,<br>\n Edmunds, George F, Burlington, Vt, 2111 Massachusetts avenue<br>\n Eustis, James B,,New Orleans, La, 1761 N street NW<br>\n Evarts, William M,New York, N. Y, i6oi K street NW<br>\n Farwell, Charles B, Chicago, Ill,<br>\n Faulkner, Charles James, Martinsburgh, W. Va,<br>\n Frye, William P,Lewiston, Me, Hamilton House,<br>\n George, James Z,Jackson, Miss, Metropolitan Hotel<br>\n Gibson, Randall Lee, New Orleans, La, 1723 Rhode Island avenue.<br>\n Gorman, Arthur P, Laurel, Md .,1403 K street NW<br>\n Gray, George,Wilmington, Del,<br>\n Hale, Eugene,Ellsworth, Me, 917 Sixthteenth st. NW<br>\n Hampton, Wade, Columbia, S. C,<br>\n Harris, Isham G, Memphis,Tenn, 13 First street NE<br>\n Hawley, Joseph R,Hartford, Corn, 1514 K street NW<br>\n Hearst, George,San Francisco, Cal,<br>\n Hiscock, Frank, Syracuse, N. Y, Arlington Hotel<br>\n Hoar, George F, Worcester, Mass, 1325 K street NW<br>\n Ingalls, John James, Atchison, Kans, I B street NW<br>\n Jones, James K,Washington, Ark, 915 M street NW<br>\n Jones, John P,Gold Hill, Nev<br>\n Kenna, John E,Charleston, W. Va, 14o B street NW<br>\n McPherson, John ,Jersey City, N. 
J, 1014 Vermont avenue,<br>\n Manderson, CharlesF. Omaha, Nebr,The Portland<br>\n Morgan, John T,.Selma, Ala,I 13 First street NE<br>\n Morrill, Justin S, Stratford, Vt, x Thomas Circle <br><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"regular-expressions-regex\">\u6b63\u898f\u8868\u73fe\uff08Regex\uff09<\/h2>\n\n\n\n<p>\u6b63\u898f\u8868\u73fe\u306f\u3001\u3044\u308f\u3086\u308b\u30d7\u30ed\u30b0\u30e9\u30df\u30f3\u30b0\u8a00\u8a9e\u3067\u306f\u3042\u308a\u307e\u305b\u3093\u3002\u305d\u3046\u3067\u306f\u306a\u304f\u3001\u69d8\u3005\u306a\u30d7\u30ed\u30b0\u30e9\u30df\u30f3\u30b0\u8a00\u8a9e\u3067\u7528\u3044\u3089\u308c\u3066\u3044\u308b\u69cb\u6587\u306b\u5f93\u3063\u3066\u304a\u308a\u3001\u4e00\u9023\u306e\u6587\u5b57\u3092\u4f7f\u3044\u3001\u30c6\u30ad\u30b9\u30c8\u5185\u306e\u6b63\u78ba\u306a\u30d1\u30bf\u30fc\u30f3\u3092\u898b\u3064\u3051\u305f\u308a\u7f6e\u63db\u3092\u3057\u305f\u308a\u3057\u307e\u3059\u3002\u4f8b\u3048\u3070\u3001\u3053\u306e\u30b5\u30f3\u30d7\u30eb\u30c6\u30ad\u30b9\u30c8\u3092\u4f7f\u3063\u3066\u307f\u307e\u3057\u3087\u3046:<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">Let&#8217;s get all this bad OCR and $tuff. Gr8!<\/p>\n\n\n\n<p>1. \u6b21\u306e\u6b63\u898f\u8868\u73fe\u3092\u4f7f\u3048\u3070\u3001\u5168\u3066\u306e\u5927\u6587\u5b57\uff08L\u3001O\u3001C\u3001R\u3001G\uff09\u3092\u6307\u5b9a\u3067\u304d\u307e\u3059:<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">[A-Z]<\/p>\n\n\n\n<p>2. \u6b21\u306e\u6b63\u898f\u8868\u73fe\u3092\u4f7f\u3048\u3070\u3001\u6700\u521d\u306e\u5927\u6587\u5b57\uff08L\uff09\u3060\u3051\u3092\u6307\u5b9a\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059:<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">^[A-Z]<\/p>\n\n\n\n<p>3. 
\u6b21\u306e\u6b63\u898f\u8868\u73fe\u3092\u4f7f\u3048\u3070\u3001<strong>\u5927\u6587\u5b57\u4ee5\u5916\u306e<\/strong>\u5168\u3066\u306e\u6587\u5b57\u3092\u6307\u5b9a\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059:<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">[^A-Z]<\/p>\n\n\n\n<p>4. \u6b21\u306e\u6b63\u898f\u8868\u73fe\u3067\u3001\u7565\u8a9e\u300cOCR\u300d\u3092\u6307\u5b9a\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059:<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">[A-Z]{3}<\/p>\n\n\n\n<p>5. \u6b21\u306e\u6b63\u898f\u8868\u73fe\u3092\u4f7f\u3063\u3066\u3001\u53e5\u8aad\u70b9\u3092\u6307\u5b9a\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059:<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">[[:punct:]]<\/p>\n\n\n\n<p>6. \u6b21\u306b\u793a\u3059\u65b9\u6cd5\u3067\u3001\u5168\u3066\u306e\u53e5\u8aad\u70b9\u3001\u30b9\u30da\u30fc\u30b9\u3001\u6570\u5b57\u3092\u9694\u96e2\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059:<\/p>\n\n\n\n<p class=\"has-background has-very-light-gray-background-color\">[[:punct:], ,0-9]<\/p>\n\n\n\n<p>\u3000\u4f7f\u7528\u3059\u308b\u6587\u5b57\u30bb\u30c3\u30c8\u306f\u3042\u307e\u308a\u591a\u304f\u306f\u3042\u308a\u307e\u305b\u3093\u304c\u3001\u30d1\u30bf\u30fc\u30f3\u304c\u8907\u96d1\u306b\u306a\u308b\u3053\u3068\u304c\u3042\u308a\u307e\u3059\u3002\u307e\u305f\u305d\u308c\u4ee5\u4e0a\u306b\u3001\u914d\u7f6e\u3059\u308b\u5834\u6240\u306b\u3088\u3063\u3066\u6587\u5b57\u304c\u7570\u306a\u308b\u610f\u5473\u3068\u306a\u308b\u3053\u3068\u3082\u3042\u308a\u3048\u307e\u3059\u3002\u4e0a\u8a18\u306e\u4f8b2\u3068\u4f8b3\u306e\u9055\u3044\u3092\u4f8b\u306b\u6319\u3052\u3066\u307f\u307e\u3057\u3087\u3046\u3002\u4f8b2\u3067\u306f\u3001\u8131\u5b57\u7b26\u53f7 (^) 
\u306f\u3001\u884c\u307e\u305f\u306f\u6587\u66f8\u306e\u6700\u521d\u306e\u30d1\u30bf\u30fc\u30f3\u3092\u6307\u5b9a\u3059\u308b\u3053\u3068\u3092\u610f\u5473\u3057\u3066\u3044\u307e\u3059\u3002\u3057\u304b\u3057\u3001\u6587\u5b57\u30af\u30e9\u30b9\uff08[]\uff09\u306e\u4e2d\u306b\u8131\u5b57\u7b26\u53f7\u3092\u66f8\u304f\u3068\u3001\u3053\u308c\u3089\u306e\u6587\u5b57\u30bb\u30c3\u30c8\u300c\u4ee5\u5916\u300d\u3068\u3044\u3046\u610f\u5473\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n\n\n\n<p>\u6b63\u898f\u8868\u73fe\u3092\u7406\u89e3\u3059\u308b\u6700\u3082\u3088\u3044\u65b9\u6cd5\u306f\u3001\u6587\u5b57\u304c\u7570\u306a\u308b\u4f4d\u7f6e\u3067\u3069\u306e\u3088\u3046\u306a\u5f79\u76ee\u3092\u679c\u305f\u3059\u306e\u304b\u3092\u77e5\u308a\u3001\u3068\u306b\u304b\u304f\u7df4\u7fd2\u3059\u308b\u3053\u3068\u3067\u3059\u3002\u3044\u308d\u3044\u308d\u8a66\u3059\u3053\u3068\u304c\u6700\u5584\u306e\u5b66\u7fd2\u65b9\u6cd5\u306a\u306e\u3067\u3001\u6b63\u898f\u8868\u73fe\u30c6\u30b9\u30bf\u30fc\u30c4\u30fc\u30eb\u3092\u4f7f\u3044\u3001\u305d\u306e\u69cb\u6587\u3092\u8a66\u3057\u3066\u307f\u308b\u3053\u3068\u3092\u304a\u52e7\u3081\u3057\u307e\u3059\u3002Mac\u30e6\u30fc\u30b6\u30fc\u306a\u3089\u3001<a rel=\"noreferrer noopener\" href=\"http:\/\/krillapps.com\/patterns\/\" target=\"_blank\">Patterns App<\/a>\uff08Mac 
Store\u3001\u7c73$2.99\uff09\u3068\u3044\u3046\u3001\u30ea\u30a2\u30eb\u30bf\u30a4\u30e0\u3067\u6b63\u898f\u8868\u73fe\u304c\u4f55\u3092\u3057\u3066\u3044\u308b\u304b\u3092\u53ef\u8996\u5316\u3057\u3066\u304f\u308c\u308b\u4fbf\u5229\u306a\u30c4\u30fc\u30eb\u3082\u3042\u308a\u307e\u3059\u3002\u540c\u30a2\u30d7\u30ea\u306b\u306f\u3001\u5404\u30b7\u30f3\u30dc\u30eb\u306e\u5185\u8535\u30c1\u30fc\u30c8\u30b7\u30fc\u30c8\u3082\u4ed8\u3044\u3066\u304d\u307e\u3059\u304c\u3001\u7b46\u8005\u306f\u3069\u3061\u3089\u304b\u3068\u3044\u3046\u3068\u3053\u306e\u6c4e\u7528\u7684\uff08\u3064\u307e\u308a\u30d7\u30ed\u30b0\u30e9\u30df\u30f3\u30b0\u8a00\u8a9e\u3092\u6a2a\u65ad\u7684\u306b\u4f7f\u3048\u308b\uff09<a rel=\"noreferrer noopener\" href=\"http:\/\/www.addedbytes.com\/cheat-sheets\/regular-expressions-cheat-sheet\/\" target=\"_blank\">\u30c1\u30fc\u30c8\u30b7\u30fc\u30c8<\/a>\u304c\u3001\u3088\u308a\u7bc4\u56f2\u306e\u5e83\u3044\u3082\u306e\u3068\u601d\u3044\u307e\u3057\u305f\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"python-and-regex\">Python\u3068\u6b63\u898f\u8868\u73fe<\/h2>\n\n\n\n<p>\u672c\u30c1\u30e5\u30fc\u30c8\u30ea\u30a2\u30eb\u3067\u306f\u3001\u6b63\u898f\u8868\u73fePython\u30e2\u30b8\u30e5\u30fc\u30eb\u3092\u7528\u3044\u3066\u3001\u5148\u307b\u3069\u306e<em>Congressional Directory<\/em>\u306e\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u306e\u300c\u30af\u30ea\u30fc\u30f3\u300d\u30d0\u30fc\u30b8\u30e7\u30f3\u3092\u62bd\u51fa\u3057\u3066\u3044\u304d\u307e\u3059\u3002\u3053\u306e\u30e2\u30b8\u30e5\u30fc\u30eb\u306e<a href=\"http:\/\/docs.python.org\/2\/library\/re.html\">\u516c\u5f0f\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8<\/a>\u306f\u8aac\u660e\u304c\u8a73\u7d30\u3067\u3059\u306e\u3067\u3001\u521d\u5fc3\u8005\u306f\u3088\u308a\u30b7\u30f3\u30d7\u30eb\u306a<a rel=\"noreferrer noopener\" href=\"http:\/\/docs.python.org\/2\/howto\/regex.html#regex-howto\" 
target=\"_blank\">\u6b63\u898f\u8868\u73feHOWTO\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8<\/a>\u3092\u5229\u7528\u3059\u308b\u3068\u3088\u3044\u3067\u3057\u3087\u3046\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"two-things-to-note-before-you-get-started\"><strong>\u59cb\u3081\u308b\u524d\u306b\u899a\u3048\u3066\u304a\u304f\u3079\u304d2\u3064\u306e\u3053\u3068<\/strong><\/h3>\n\n\n\n<ul><li>\u601d\u3046\u306b\u3001\u3042\u308b\u4e00\u3064\u306e\u6587\u66f8\u3092\u30af\u30ea\u30fc\u30f3\u306b\u3057\u306a\u304f\u3066\u306f\u3044\u3051\u306a\u3044\u306e\u3067\u3042\u308c\u3070\u3001Python\u304c\u6700\u3082\u52b9\u7387\u7684\u306a\u65b9\u6cd5\u3068\u3044\u3046\u308f\u3051\u3067\u306f<em>\u3042\u308a\u307e\u305b\u3093<\/em>\u3002<a rel=\"noreferrer noopener\" href=\"http:\/\/www.gnu.org\/software\/sed\/\" target=\"_blank\">sed<\/a>\u3084<a rel=\"noreferrer noopener\" href=\"http:\/\/www.gnu.org\/software\/grep\/\" target=\"_blank\">grep<\/a>\u306a\u3069\u306e\u30b3\u30de\u30f3\u30c9\u30e9\u30a4\u30f3\u30d7\u30ed\u30b0\u30e9\u30e0\u304c\u3001\u3053\u306e\u7a2e\u306e\u51e6\u7406\u3067\u306f\u3088\u308a\u52b9\u679c\u7684\u3060\u3068\u601d\u3044\u307e\u3059\u3002\uff08\u3053\u308c\u3089\u306e\u30c1\u30e5\u30fc\u30c8\u30ea\u30a2\u30eb\u306e\u4f5c\u6210\u306f\u3001grep\/sed\u30e6\u30fc\u30b6\u30fc\u306b\u4efb\u305b\u307e\u3059\uff09\u3002\u79c1\u304cPython\u3092\u63a1\u7528\u3057\u305f\u7406\u7531\u306f\u3044\u304f\u3064\u304b\u3042\u308a\u307e\u3059\u3002\u307e\u305a\u30011) Python\u306e\u66f8\u304d\u65b9\u3092\u3088\u304f\u7406\u89e3\u3057\u3066\u3044\u308b\u304b\u3089\u3067\u3042\u308a\u3001\u307e\u305f\u30012) \u30df\u30b9\u3092\u7c21\u5358\u306b\u8ffd\u8de1\u3067\u304d\u308b\u3088\u3046\u306b\u30b9\u30c6\u30c3\u30d7\u30921\u30641\u3064\u5358\u4e00\u306e\u30d5\u30a1\u30a4\u30eb\u306b\u66f8\u304d\u51fa\u3057\u3066\u304f\u308c\u308b\u304b\u3089\u3002\u305d\u3057\u3066\u30013) <em>Congressional      
Directory<\/em>\u304b\u3089\u8907\u6570\u30da\u30fc\u30b8\u3092\u304d\u308c\u3044\u306b\u3059\u308b\u305f\u3081\u3001\u4f55\u5ea6\u3082\u7e70\u308a\u8fd4\u3057\u4f7f\u3048\u308b\u30d7\u30ed\u30b0\u30e9\u30e0\u304c\u5fc5\u8981\u3060\u304b\u3089\u3001\u3068\u3044\u3063\u305f\u7406\u7531\u3067\u3059\u3002<\/li><li>\u3053\u306e\u6587\u66f8\u8cc7\u6599\u306b\u5bfe\u3059\u308bOCR\u306f\u4e00\u8cab\u6027\u306e\u3042\u308b\u3082\u306e\u3068\u306f\u8a00\u3048\u307e\u305b\u3093\uff08\u5358\u4e00\u30da\u30fc\u30b8\u5185\u3067\u3082\u8907\u6570\u30da\u30fc\u30b8\u3067\u3082\uff09\u3002\u305d\u306e\u305f\u3081\u3001\u3053\u306e\u30af\u30ea\u30fc\u30cb\u30f3\u30b0\u30c1\u30e5\u30fc\u30c8\u30ea\u30a2\u30eb\u306e\u7d50\u679c\u306f\u5b8c\u74a7\u3067\u306f\u3042\u308a\u307e\u305b\u3093\u3002<strong>\u3053\u3053\u3067\u306e\u76ee\u6a19\u306f\u3001\u6b63\u898f\u8868\u73fe\u306b\u9762\u5012\u306a\u4f5c\u696d\u3092\u4efb\u305b\u3066\u3001\u958b\u59cb\u6642\u306e\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf<em>\u3088\u308a\u3082<\/em>\u6574\u3063\u305f\u5f62\u5f0f\u3067\u6587\u66f8\u3092\u30a8\u30af\u30b9\u30dd\u30fc\u30c8\u3059\u308b\u3053\u3068\u3067\u3059\u3002<\/strong>\u3053\u308c\u306b\u3088\u308a\u3001\u4f4f\u6240\u30c7\u30fc\u30bf\u306e\u30b8\u30aa\u30b3\u30fc\u30c7\u30a3\u30f3\u30b0\u3092\u884c\u3046\u524d\u306b\u3001\u624b\u4f5c\u696d\u3067\u884c\u308f\u306a\u304f\u3066\u306f\u3044\u3051\u306a\u3044\u524d\u51e6\u7406\u3092\u5927\u5e45\u306b\u8efd\u6e1b\u3057\u307e\u3059\u304c\u3001\u4f5c\u696d\u306e\u5fc5\u8981\u6027\u304c\u306a\u304f\u306a\u308b\u308f\u3051\u3067\u306f\u3042\u308a\u307e\u305b\u3093\u3002<\/li><\/ul>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"my-example-python-file\"><strong>\u30b5\u30f3\u30d7\u30ebPython\u30d5\u30a1\u30a4\u30eb<\/strong><\/h3>\n\n\n\n<p>\u30c7\u30fc\u30bf\u30af\u30ec\u30f3\u30b8\u30f3\u30b0\u306e\u305f\u3081\u306b\u4f5c\u6210\u3057\u305fPython\u30d5\u30a1\u30a4\u30eb\u306f\u4ee5\u4e0b\u306e\u901a\u308a\u3067\u3059\u3002<\/p>\n\n\n\n<pre 
class=\"wp-block-preformatted has-background has-very-light-gray-background-color\"> <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#cdocr.py<\/em>\n <em>#HeinOnline\u306e\u30c6\u30ad\u30b9\u30c8\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u304b\u3089\u53e5\u8aad\u70b9\u3092\u53d6\u308a\u9664\u304d\u3001\u60c5\u5831\u3092\u62bd\u51fa\u3059\u308b<\/em>\n &nbsp;\n <em>#\u30e2\u30b8\u30e5\u30fc\u30ebre\u3092\u30a4\u30f3\u30dd\u30fc\u30c8\u3059\u308b<\/em><\/font>\n <strong>import<\/strong> re\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3092\u958b\u304d\u3001\u305d\u306e\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3092\u30ea\u30b9\u30c8\u306b\u8aad\u307f\u8fbc\u3080<\/em><\/font>\n <strong>with<\/strong> <font style=\"color:#0000ff\" class=\"has-text-color\">open<\/font>(<font style=\"color:#ff0000\" class=\"has-text-color\">'..\/..\/data\/txt\/50-1-p1.txt'<\/font>) <strong>as<\/strong> ocr:\n &nbsp;&nbsp;&nbsp; Text <strong>=<\/strong> ocr<strong>.<\/strong>readlines()\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u4fee\u6b63\u5f8c\u306e\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf\u3092\u5165\u308c\u308b\u305f\u3081\u306e\u7a7a\u306e\u30ea\u30b9\u30c8\u3092\u4f5c\u6210\u3059\u308b<\/em><\/font>\n CleanText <strong>=<\/strong> []\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30a4\u30f3\u30dd\u30fc\u30c8\u3057\u305f\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u5185\u306e\u5404\u884c\u306b\u3064\u3044\u3066\u3001\u4ee5\u4e0b\u306e\u5168\u30d1\u30bf\u30fc\u30f3\u306b\u5408\u81f4\u3059\u308b\u304b\u30c1\u30a7\u30c3\u30af\u3059\u308b<\/em><\/font>\n <strong>for<\/strong> line <strong>in<\/strong> Text:\n &nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" 
class=\"has-text-color\"><em>#\u8907\u6570\u306e\u30c0\u30c3\u30b7\u30e5\u3092\u542b\u3080\u884c\u306b\u306f\u30c7\u30fc\u30bf\u304c\u542b\u307e\u308c\u3066\u3044\u308b\u3053\u3068\u304b\u3089\u3001\u305d\u308c\u3089\u306e\u884c\u3092\u691c\u7d22\u3059\u308b\u3088\u3046\u306b\u3059\u308b<\/em>\n &nbsp;&nbsp;&nbsp; <em>#--\u306f\u30c6\u30ad\u30b9\u30c8\u5185\u90e8\u306b\u30c0\u30c3\u30b7\u30e5\u4e00\u3064\u3060\u3051\u3092\u3082\u3064\u884c\u306b\u306f\u4e00\u81f4\u3057\u306a\u3044<\/em><\/font>\n &nbsp;&nbsp;&nbsp; dashes <strong>=<\/strong> re<strong>.<\/strong>search(<font style=\"color:#ff0000\" class=\"has-text-color\">'(--+)'<\/font>, line)\n &nbsp;\n &nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30c0\u30c3\u30b7\u30e5\u306e\u3042\u308b\u884c\u3092\u4e00\u81f4\u3055\u305b\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp; <strong>if<\/strong> dashes:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30c0\u30c3\u30b7\u30e5\u3092\u81ea\u5206\u3067\u9078\u629e\u3057\u305f\u30c7\u30ea\u30df\u30bf\u30fc\u306b\u7f6e\u63db\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nodash <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'.(-+)'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, line)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u8907\u6570\u306e\u30d4\u30ea\u30aa\u30c9\u3092\u6d88\u3059<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nodots <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'.(\\.\\.+)'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">''<\/font>, nodash)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" 
class=\"has-text-color\"><em>#\u4f59\u5206\u306a\u7a7a\u767d\u306e\u500b\u6240\u3092,\u306b\u7f6e\u63db\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nospaces <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'(&nbsp; +)'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, nodots)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30a2\u30b9\u30bf\u30ea\u30b9\u30af(*)\u3092\u6d88\u3059<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nostar <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'.[*]'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">''<\/font>, nospaces)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u884c\u306e\u5148\u982d\u306b\u6539\u884c\u3084\u30b3\u30f3\u30de\u304c\u3042\u308c\u3070\u305d\u308c\u3092\u6d88\u3059<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; flushleft <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'^\\W'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">''<\/font>, nostar)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u4e8c\u91cd\u30b3\u30f3\u30de\u3092\u6d88\u3059\uff08Evarts\u306e\u500b\u6240\uff09<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; comma <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">',{2,3}'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, flushleft)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" 
class=\"has-text-color\"><em>#\u5358\u8a9e\u9593\u306e\u7a7a\u767d\u304c\u306a\u3044\u500b\u6240\u3092\u6574\u5f62\u3059\u308b\uff08\u4f8b\u3048\u3070Dawes\u3068Manderson\uff09<\/em>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <em>#\u4f4f\u6240\u306b00\u3068\u3044\u3046\u8868\u8a18\u304c\u3042\u308b\u3068\u3053\u308d\u306f\u3001\u305d\u306e\u4e8c\u91cd\u306e00\u3092\u30b9\u30ad\u30c3\u30d7\u3055\u305b\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; caps <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'[A-N|P-Z]{2,}'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, comma)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30d4\u30ea\u30aa\u30c9\u3092\u53d6\u308a\u9664\u3044\u3066\u3001NE\u3068NW\u306e\u30a4\u30f3\u30c7\u30a3\u30b1\u30fc\u30bf\u3092\u6574\u5f62\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ne <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'(\\,*? N\\. ?E.)'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">' NE'<\/font>, caps)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nw <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'(\\,*?N\\. 
?W[\\.\\,]*?_?)$'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">' NW'<\/font>, ne) <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#VERBOSE\u5316\u3059\u308b<\/em>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <em>#\u30e9\u30b9\u30c8\u30cd\u30fc\u30e0\u3068\u30d5\u30a1\u30fc\u30b9\u30c8\u30cd\u30fc\u30e0\u306e\u9593\u306b\u3042\u308b\u30d4\u30ea\u30aa\u30c9\u3092\u30b3\u30f3\u30de\u306b\u5909\u63db(Chace, Cockrell\u306e\u500b\u6240)<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; match <strong>=<\/strong> re<strong>.<\/strong>search(<font style=\"color:#ff0000\" class=\"has-text-color\">'^([A-Z][a-z]+\\. )'<\/font>, nw) <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#VERBOSE\u5316\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <strong>if<\/strong> match:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; names <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'\\.'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, nw)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <strong>else<\/strong>:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; names <strong>=<\/strong> nw\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30eb\u30fc\u30d7\u3057\u3066\u3044\u308b\u9593\u3001\u5404\u884c\u3092\u30ea\u30b9\u30c8CleanText\u306b\u8ffd\u52a0\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; CleanText<strong>.<\/strong>append(names)\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u201c\u30d5\u30a7\u30a4\u30af\u201d\u306eCSV\u30d5\u30a1\u30a4\u30eb\u306b\u4fdd\u5b58\u3059\u308b&nbsp;&nbsp; <\/em><\/font>\n <strong>with<\/strong> <font style=\"color:#0000ff\" class=\"has-text-color\">open<\/font>(<font style=\"color:#ff0000\" class=\"has-text-color\">'cdocr2\/50-1p1.csv'<\/font>, 
<font style=\"color:#ff0000\" class=\"has-text-color\">'w'<\/font>) <strong>as<\/strong> fcsv:\n &nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#CleanText\u5185\u306e\u5404\u884c\u3092\u30d5\u30a1\u30a4\u30eb\u306b\u66f8\u304d\u8fbc\u3080<\/em><\/font>\n &nbsp;&nbsp;&nbsp; <strong>for<\/strong> line <strong>in<\/strong> CleanText:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; fcsv<strong>.<\/strong>write(line) <\/pre>\n\n\n\n<p>\u30b3\u30e1\u30f3\u30c8\u3092\u304b\u306a\u308a\u4e01\u5be7\u306b\u4ed8\u3051\u52a0\u3048\u307e\u3057\u305f\u306e\u3067\u3001\u3053\u3053\u3067\u306f\u306a\u305c\u3053\u306e\u3088\u3046\u306a\u30b3\u30fc\u30c9\u306e\u69cb\u9020\u306b\u3057\u305f\u306e\u304b\u3092\u8aac\u660e\u3057\u307e\u3057\u3087\u3046\u3002\u307e\u305f\u3001\u8aad\u307f\u3084\u3059\u304f\u3055\u305b\u308b\u305f\u3081\u3001\u9577\u3044\u6b63\u898f\u8868\u73fe\u3092\u6574\u3048\u308b\u5225\u306e\u65b9\u6cd5\u3082\u7d39\u4ecb\u3057\u307e\u3057\u3087\u3046\u3002<\/p>\n\n\n\n<ul><li><strong>16-22\u884c\u76ee<\/strong>&nbsp;\u2013 \u5143\u306e\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3067\u306f\u3001\u30c7\u30fc\u30bf\u304c\u5168\u3066\u8907\u6570\u306e\u30c0\u30c3\u30b7\u30e5\u3092\u542b\u3080\u884c\u306b\u3042\u308b\u70b9\u306b\u6ce8\u610f\u3057\u3066\u304f\u3060\u3055\u3044\u3002\u3053\u3053\u3067\u306e\u30b3\u30fc\u30c9\u306f\u3001\u305d\u308c\u3089\u306e\u884c\u3092\u52b9\u7387\u7684\u306b\u6307\u5b9a\u3057\u3066\u3044\u308b\u306e\u3067\u3059\u3002<a rel=\"noreferrer noopener\" href=\"http:\/\/docs.python.org\/2\/library\/re.html#re.search\" 
target=\"_blank\">re.search()<\/a>\u95a2\u6570\u3092\u4f7f\u3046\u3053\u3068\u3067\u3001\u8907\u6570\u306e\u30c0\u30c3\u30b7\u30e5\u3092\u542b\u3080\u5168\u3066\u306e\u884c\u3092\u898b\u3064\u3051\u51fa\u3057\u3066\u3044\u307e\u3059\u300220\u884c\u76ee\u306eif\u6587\u306b\u3088\u3063\u3066\u3001\u6b8b\u308a\u306e\u30b3\u30fc\u30c9\u304c\u30c0\u30c3\u30b7\u30e5\u3092\u542b\u3080\u884c\u306e\u307f\u3092\u5bfe\u8c61\u306b\u52d5\u304f\u3088\u3046\u306b\u3057\u3066\u3044\u307e\u3059\u3002\uff08\u3053\u308c\u306b\u3088\u308a\u3001\u7b46\u8005\u304c\u6c42\u3081\u3066\u3044\u308b\u30c7\u30fc\u30bf\u306e\u5f8c\u306b\u7d9a\u3044\u3066\u3044\u308b\u3001\u5e8f\u6587\u3084\u30da\u30fc\u30b8\u756a\u53f7\u306e\u884c\u306a\u3069\u3059\u3079\u3066\u3092\u9664\u5916\u3067\u304d\u308b\u306e\u3067\u3059\uff09\u3002<\/li><li><strong>23-40\u884c\u76ee<\/strong>\u2013 \u3053\u3053\u3067\u306f\u3001\u4f59\u8a08\u306a\u53e5\u8aad\u70b9\u3092\u5168\u3066\u53d6\u308a\u9664\u304d\u3001\u30c7\u30fc\u30bf\u5185\u306e\u5fc5\u8981\u500b\u6240\uff08\u30e9\u30b9\u30c8\u30cd\u30fc\u30e0\u3001\u30d5\u30a1\u30fc\u30b9\u30c8\u30cd\u30fc\u30e0\u3001\u5730\u5143\u306e\u90f5\u4fbf\u5c40\u3001\u30ef\u30b7\u30f3\u30c8\u30f3\u306e\u4f4f\u6240\uff09\u3092csv\u30d5\u30a1\u30a4\u30eb\u5185\u306e\u7570\u306a\u308b\u30d5\u30a3\u30fc\u30eb\u30c9\u306b\u79fb\u3059\u3068\u3044\u3046\u9577\u3044\u51e6\u7406\u3092\u884c\u3063\u3066\u3044\u307e\u3059\u3002<a rel=\"noreferrer noopener\" href=\"http:\/\/docs.python.org\/2\/library\/re.html#re.sub\" 
target=\"_blank\">re.sub()<\/a>\u95a2\u6570\u3092\u4f7f\u3046\u3053\u3068\u3067\u3001\u30d1\u30bf\u30fc\u30f3\u3092\u5225\u306e\u6587\u5b57\u306b\u7f6e\u304d\u63db\u3048\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002\u30b3\u30e1\u30f3\u30c8\u3092\u4e01\u5be7\u306b\u4ed8\u3051\u3066\u3044\u308b\u306e\u3067\u3001\u305d\u308c\u305e\u308c\u306e\u500b\u6240\u304c\u4f55\u3092\u3084\u3063\u3066\u3044\u308b\u306e\u304b\u304c\u5206\u304b\u308b\u3067\u3057\u3087\u3046\u3002\u3053\u308c\u306f\u6700\u3082\u52b9\u7387\u7684\u306a\u3084\u308a\u304b\u305f\u3067\u306f\u306a\u3044\u304b\u3082\u3057\u308c\u307e\u305b\u3093\u304c\u3001\u3053\u308c\u30921\u3064\u305a\u3064\u3084\u3063\u3066\u3044\u304f\u3053\u3068\u3067\u3001\u4f5c\u696d\u3092\u9032\u3081\u3066\u3044\u304d\u306a\u304c\u3089\u7d50\u679c\u3092\u78ba\u8a8d\u3067\u304d\u308b\u306e\u3067\u3059\u3002\u79c1\u306f\u30eb\u30fc\u30d7\u3092\u69cb\u7bc9\u3057\u306a\u304c\u3089\u3001\u30b3\u30de\u30f3\u30c9\u30e9\u30a4\u30f3\u306b\u5909\u6570\u3092\u8868\u793a\u3055\u305b\u3066\u5404\u30b9\u30c6\u30c3\u30d7\u3092\u78ba\u8a8d\u3057\u3066\u3044\u307e\u3057\u305f\u3002\u30d5\u30a1\u30a4\u30eb\u3092\u30b3\u30de\u30f3\u30c9\u30e9\u30a4\u30f3\u3067\u5b9f\u884c\u3059\u308b\u524d\u306b\u3001\u4f8b\u3048\u307024\u884c\u76ee\uff08\u30c0\u30c3\u30b7\u30e5\u3092\u9664\u53bb\u3057\u3066\u3044\u308b\u500b\u6240\uff09\u306e\u5f8c\u306b\u3001\uff08if\u30eb\u30fc\u30d7\u5185\u3067\uff09\u300cprint nodash\u300d\u3092\u8ffd\u52a0\u3057\u3066\u307f\u308b\u3053\u3068\u3082\u3067\u304d\u307e\u3059\u3002\u305d\u3046\u3059\u308c\u3070\u3001\u66f4\u65b0\u3057\u305f\u3044\u3082\u306e\u3060\u3051\u3092\u66f4\u65b0\u3057\u3001<em>\u671b\u3093\u3067\u3044\u306a\u3044<\/em>\u5909\u66f4\u304c\u884c\u308f\u308c\u3066\u3044\u306a\u3044\u3088\u3046\u3001\u5404\u30b9\u30c6\u30c3\u30d7\u3092\u78ba\u8a8d\u3067\u304d\u308b\u306e\u3067\u3059\u3002<\/li><li><strong>41-46\u884c\u76ee<\/strong> &#8211; 
\u3053\u3053\u3067\u306f\u5c11\u3057\u9055\u3046\u30e1\u30bd\u30c3\u30c9\u3092\u4f7f\u3063\u3066\u3044\u307e\u3059\u3002OCR\u306e\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3067\u306f\u3001\u4e00\u90e8\u306e\u4eba\u540d\u306e\u5f8c\u308d\u304c\u30ab\u30f3\u30de\u3067\u306f\u306a\u304f\u30d4\u30ea\u30aa\u30c9\u306b\u306a\u3063\u3066\u3057\u307e\u3063\u3066\u3044\u307e\u3059\uff08\u4f8b\u3048\u3070\u3001Chace,Jonathan\u3067\u306f\u306a\u304fChace.Jonathan\u306a\u3069\uff09\u3002\u3053\u306e\u30d1\u30bf\u30fc\u30f3\u3067\u51fa\u73fe\u3059\u308b\u30d4\u30ea\u30aa\u30c9\u3092\u5168\u3066\u30ab\u30f3\u30de\u3078\u3068\u5909\u66f4\u3057\u305f\u3044\u306e\u3067\u3059\u3002\u305d\u3053\u3067\u3001\u884c\u306e\u982d\uff08^\uff09\u3092\u898b\u3066\u3001\u5927\u6587\u5b57\u304c1\u3064\u3068\u305d\u306e\u3042\u3068\u306b\u5c0f\u6587\u5b57\u304c\u8907\u6570\u3001\u305d\u3057\u3066\u30d4\u30ea\u30aa\u30c9\u3068\u7d9a\u304f\u3001\u300c^([A-Z][a-z]+\\.)\u300d\u3068\u3044\u3046\u30d1\u30bf\u30fc\u30f3\u3092\u63a2\u3057\u51fa\u3057\u3066\u3044\u307e\u3059\u3002\u30d1\u30bf\u30fc\u30f3\u3067\u6307\u5b9a\u3057\u305f\u5f8c\u306f\u3001\u305d\u306e\u30d1\u30bf\u30fc\u30f3\u306b\u4e00\u81f4\u3059\u308b\u884c\u306e\u30d4\u30ea\u30aa\u30c9\u3092\u30ab\u30f3\u30de\u306b\u7f6e\u304d\u63db\u3048\u308b\u51e6\u7406\u3092\u3057\u3066\u3044\u307e\u3059\u3002<\/li><\/ul>\n\n\n\n<h3 class=\"wp-block-heading\" 
id=\"using-verbose-mode\">VERBOSE\u30e2\u30fc\u30c9\u3092\u6d3b\u7528<\/h3>\n\n\n\n<p>\u307b\u3068\u3093\u3069\u306e\u6b63\u898f\u8868\u73fe\u306f\u8aad\u307f\u306b\u304f\u3044\u3067\u3059\u3002\u3057\u304b\u3057\u300139\u884c\u76ee\u306840\u884c\u76ee\u306f<em>\u7279\u306b<\/em>\u8aad\u307f\u3065\u3089\u3044\u306e\u3067\u306f\u306a\u3044\u3067\u3057\u3087\u3046\u304b\u3002\u3042\u306a\u305f\u306e\u30b3\u30fc\u30c9\u3092\u8aad\u3080\u4eba\u306e\u305f\u3081\uff08\u3042\u308b\u3044\u306f\u5348\u524d2\u6642\u306b\u30b3\u30fc\u30c9\u3092\u773a\u3081\u3066\u3044\u308b\u3042\u306a\u305f\u81ea\u8eab\u306e\u305f\u3081\uff09\u306b\u3082\u3001\u3053\u308c\u3089\u306e\u30d1\u30bf\u30fc\u30f3\u3092\u3069\u306e\u3088\u3046\u306b\u304d\u308c\u3044\u306b\u3067\u304d\u308b\u3067\u3057\u3087\u3046\u304b\uff1fre\u30e2\u30b8\u30e5\u30fc\u30eb\u306e<a rel=\"noreferrer noopener\" href=\"http:\/\/docs.python.org\/2\/library\/re.html#re.VERBOSE\" target=\"_blank\">VERBOSE\u30e2\u30fc\u30c9<\/a>\u3092\u4f7f\u3048\u3070\u3088\u3044\u306e\u3067\u3059\u3002\u30d1\u30bf\u30fc\u30f3\u3092VERBOSE\u30e2\u30fc\u30c9\u306b\u3059\u308b\u3053\u3068\u3067\u3001python\u304c\u30b9\u30da\u30fc\u30b9\u3084#\u8a18\u53f7\u4ee5\u964d\u306e\u6587\u5b57\u3092\u7121\u8996\u3057\u3066\u304f\u308c\u308b\u306e\u3067\u3001\u30d1\u30bf\u30fc\u30f3\u3092\u8907\u6570\u306e\u884c\u306b\u5206\u5272\u3057\u3066\u305d\u308c\u305e\u308c\u306b\u30b3\u30e1\u30f3\u30c8\u3092\u4ed8\u3051\u52a0\u3048\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002<strong><em>\u305f\u3060\u3057\u3001\u30b9\u30da\u30fc\u30b9\u3092\u7121\u8996\u3059\u308b\u305f\u3081\u3001\u30b9\u30da\u30fc\u30b9\u304c\u3042\u306a\u305f\u306e\u30d1\u30bf\u30fc\u30f3\u306e\u4e00\u90e8\u3067\u3042\u308b\u5834\u5408\u306f\u30d0\u30c3\u30af\u30b9\u30e9\u30c3\u30b7\u30e5\uff08\\\uff09\u3067\u30a8\u30b9\u30b1\u30fc\u30d7\u3059\u308b\u5fc5\u8981\u304c\u3042\u308b\u3053\u3068\u3092\u5fd8\u308c\u306a\u3044\u3067\u304f\u3060\u3055\u3044\u3002\u307e\u305f\u
3001re.VERBOSE\u3068re.X\u306f\u540c\u3058\u3067\u3042\u308b\u70b9\u306b\u3082\u7559\u610f\u3057\u3066\u304f\u3060\u3055\u3044\u3002<\/em><\/strong><\/p>\n\n\n\n<p>VERBOSE\u30e2\u30fc\u30c9\u3067\u66f8\u3044\u305f\u5834\u5408\u306e39\u884c\u76ee\u306840\u884c\u76ee\u306f\u4ee5\u4e0b\u306e\u901a\u308a\u3067\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted has-background has-very-light-gray-background-color\"> <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#(\\,*? N\\. ?E.)\u306b\u3064\u3044\u3066\u306f\u540c\u69d8<\/em>\n <em>#VERBOSE\u30e2\u30fc\u30c9\u3067\u306f\u3059\u3079\u3066\u306e\u30b9\u30da\u30fc\u30b9\u3092\u30a8\u30b9\u30b1\u30fc\u30d7\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059<\/em><\/font>\n ne_pattern <strong>=<\/strong> re<strong>.<\/strong><font style=\"color:#0000ff\" class=\"has-text-color\">compile<\/font>(<font style=\"color:#ff0000\" class=\"has-text-color\">r'''\n &nbsp;&nbsp;&nbsp; (&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #start group\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\,*?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for comma (escaped); *? 
= 0 or more commas with fewest results\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ N\\.?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for (escaped) space + N that might have an (escaped) period after it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ ?E&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for an E that may or may not have an space in front of it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; .&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #the E might be followed by another character.\n &nbsp;&nbsp;&nbsp; )&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #close group\n &nbsp;&nbsp;&nbsp; $&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #ONLY look at the end of a line\n '''<\/font>, re<strong>.<\/strong>VERBOSE)\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#(\\,*? N\\. ?W[\\.\\,]*?_?)$\u306b\u3064\u3044\u3066\u3082\u540c\u69d8<\/em><\/font>\n nw_pattern <strong>=<\/strong> re<strong>.<\/strong><font style=\"color:#0000ff\" class=\"has-text-color\">compile<\/font>(<font style=\"color:#ff0000\" class=\"has-text-color\">r'''\n &nbsp;&nbsp;&nbsp; (&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #start group\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\,*?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for comma (escaped); *? 
= 0 or more commas with fewest results\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ N\\.?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for (escaped) space + N that might have an (escaped) period after it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ ?W&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for an W that may or may not have an space in front of it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [\\.\\,]*?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for commas or periods (both escaped) that might come after W\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; _?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for underscore that comes after one of these NW quadrant indicators\n &nbsp;&nbsp;&nbsp; ) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#close group\n &nbsp;&nbsp;&nbsp; $&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #ONLY look at the end of a line\n '''<\/font>, re<strong>.<\/strong>X) <\/pre>\n\n\n\n<p>\u4e0a\u8a18\u306e\u4f8b\u3067\u306f\u3001<a rel=\"noreferrer noopener\" href=\"http:\/\/docs.python.org\/2\/library\/re.html#re.compile\" 
target=\"_blank\">re.compile()<\/a>\u95a2\u6570\u3092\u4f7f\u3063\u3066\u4eca\u5f8c\u3082\u4f7f\u3048\u308b\u3088\u3046\u306b\u30d1\u30bf\u30fc\u30f3\u3092\u4fdd\u5b58\u3057\u3066\u3044\u307e\u3059\u3002VERBOSE\u30e2\u30fc\u30c9\u3092\u5229\u7528\u3057\u3066\u4f5c\u6210\u3057\u305fpython\u30b3\u30fc\u30c9\u5168\u4f53\u306f\u6b21\u306e\u901a\u308a\u3068\u306a\u308a\u307e\u3059:\u3053\u3053\u3067\u300117-39\u884c\u3067VERBOSE\u30e2\u30fc\u30c9\u306e\u30d1\u30bf\u30fc\u30f3\u3092\u5b9a\u7fa9\u3057\u3001\u5909\u6570\uff08ne_pattern\u3068nw_pattern\uff09\u306b\u4fdd\u5b58\u3057\u3066\u3044\u308b\u70b9\u306b\u6ce8\u610f\u3057\u3066\u304f\u3060\u3055\u3044\u3002\u3053\u308c\u3089\u306e\u5909\u6570\u306f65\u884c\u306866\u884c\u76ee\u306e\u30eb\u30fc\u30d7\u5185\u3067\u7528\u3044\u3066\u3044\u307e\u3059\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted has-background has-very-light-gray-background-color\"><font style=\"color:#aa9857\" class=\"has-text-color\">\n <em>#cdocrverbose.py<\/em>\n <em>#HeinOnline\u306e\u30c6\u30ad\u30b9\u30c8\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u304b\u3089\u53e5\u8aad\u70b9\u3092\u53d6\u308a\u9664\u304d\u3001\u60c5\u5831\u3092\u62bd\u51fa\u3059\u308b<\/em>\n\n <em>#\u30e2\u30b8\u30e5\u30fc\u30ebre\u3092\u30a4\u30f3\u30dd\u30fc\u30c8\u3059\u308b<\/em><\/font>\n <strong>import<\/strong> re\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3092\u958b\u304d\u3001\u305d\u306e\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u3092\u30ea\u30b9\u30c8\u306b\u8aad\u307f\u8fbc\u3080<\/em><\/font>\n <strong>with<\/strong> <font style=\"color:#0000ff\" class=\"has-text-color\">open<\/font>(<font style=\"color:#FF0000\" class=\"has-text-color\">'..\/..\/data\/txt\/50-1-p1.txt'<\/font>) <strong>as<\/strong> ocr:\n &nbsp;&nbsp;&nbsp; Text <strong>=<\/strong> ocr<strong>.<\/strong>readlines()\n &nbsp;\n <font style=\"color:#aa9857\" 
class=\"has-text-color\"><em>#\u4fee\u6b63\u5f8c\u306e\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf\u3092\u5165\u308c\u308b\u305f\u3081\u306e\u7a7a\u306e\u30ea\u30b9\u30c8\u3092\u4f5c\u6210\u3059\u308b<\/em><\/font>\n CleanText <strong>=<\/strong> []\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>##\u5f8c\u3067\u4f7f\u3046\u305f\u3081\u306b\u8907\u96d1\u306a\u90e8\u5206\u306eVERBOSE\u30d1\u30bf\u30fc\u30f3\u3092\u4f5c\u6210\u3059\u308b##<\/em>\n &nbsp;\n <em>#(\\,*? N\\. ?E.)\u3068\u540c\u3058<\/em>\n <em>#VERBOSE\u30e2\u30fc\u30c9\u3067\u306f\u3059\u3079\u3066\u306e\u30b9\u30da\u30fc\u30b9\u3092\u30a8\u30b9\u30b1\u30fc\u30d7\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059<\/em><\/font>\n ne_pattern <strong>=<\/strong> re<strong>.<\/strong><font style=\"color:#0000ff\" class=\"has-text-color\">compile<\/font>(<font style=\"color:#ff0000\" class=\"has-text-color\">r'''\n &nbsp;&nbsp;&nbsp; (&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #start group\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\,*?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for comma (escaped); *? 
= 0 or more commas with fewest results\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ N\\.?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for (escaped) space + N that might have an (escaped) period after it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ ?E&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for an E that may or may not have an space in front of it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; .&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #the E might be followed by another character.\n &nbsp;&nbsp;&nbsp; )&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #close group\n &nbsp;&nbsp;&nbsp; $&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #ONLY look at the end of a line\n '''<\/font>, re<strong>.<\/strong>VERBOSE)\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#(\\,*? N\\. ?W[\\.\\,]*?_?)$\u306b\u3064\u3044\u3066\u3082\u540c\u69d8<\/em><\/font>\n nw_pattern <strong>=<\/strong> re<strong>.<\/strong><font style=\"color:#0000ff\" class=\"has-text-color\">compile<\/font>(<font style=\"color:#ff0000\" class=\"has-text-color\">r'''\n &nbsp;&nbsp;&nbsp; (&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #start group\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\,*?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for comma (escaped); *? 
= 0 or more commas with fewest results\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ N\\.?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for (escaped) space + N that might have an (escaped) period after it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \\ ?W&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for an W that may or may not have an space in front of it\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [\\.\\,]*?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for commas or periods (both escaped) that might come after W\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; _?&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #look for underscore that comes after one of these NW quadrant indicators\n &nbsp;&nbsp;&nbsp; ) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#close group\n &nbsp;&nbsp;&nbsp; $&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #ONLY look at the end of a line\n '''<\/font>, re<strong>.<\/strong>VERBOSE)\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em># \u30a4\u30f3\u30dd\u30fc\u30c8\u3057\u305f\u30c6\u30ad\u30b9\u30c8\u30d5\u30a1\u30a4\u30eb\u5185\u306e\u5404\u884c\u306b\u3064\u3044\u3066\u3001\u4ee5\u4e0b\u306e\u5168\u30d1\u30bf\u30fc\u30f3\u306b\u5408\u81f4\u3059\u308b\u304b\u30c1\u30a7\u30c3\u30af\u3059\u308b<\/em><\/font>\n <strong>for<\/strong> line <strong>in<\/strong> Text:\n &nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u8907\u6570\u306e\u30c0\u30c3\u30b7\u30e5\u3092\u542b\u3080\u884c\u306b\u306f\u30c7\u30fc\u30bf\u304c\u542b\u307e\u308c\u3066\u3044\u308b\u3053\u3068\u304b\u3089\u3001\u305d\u308c\u3089\u306e\u884c\u3092\u691c\u7d22\u3059\u308b\u3088\u3046\u306b\u3059\u308b<\/em>\n &nbsp;&nbsp;&nbsp; 
<em>#--\u306f\u30c6\u30ad\u30b9\u30c8\u5185\u90e8\u306b\u30c0\u30c3\u30b7\u30e5\u4e00\u3064\u3060\u3051\u3092\u3082\u3064\u884c\u306b\u306f\u4e00\u81f4\u3057\u306a\u3044<\/em><\/font>\n &nbsp;&nbsp;&nbsp; dashes <strong>=<\/strong> re<strong>.<\/strong>search(<font style=\"color:#ff0000\" class=\"has-text-color\">'(--+)'<\/font>, line)\n &nbsp;\n &nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30c0\u30c3\u30b7\u30e5\u306e\u3042\u308b\u884c\u3092\u4e00\u81f4\u3055\u305b\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp; <strong>if<\/strong> dashes:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30c0\u30c3\u30b7\u30e5\u3092\u81ea\u5206\u3067\u9078\u629e\u3057\u305f\u30c7\u30ea\u30df\u30bf\u30fc\u306b\u7f6e\u63db\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nodash <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'.(-+)'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, line)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u8907\u6570\u306e\u30d4\u30ea\u30aa\u30c9\u3092\u6d88\u3059<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nodots <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'.(\\.\\.+)'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">''<\/font>, nodash)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u4f59\u5206\u306a\u7a7a\u767d\u306e\u500b\u6240\u3092,\u306b\u7f6e\u63db\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nospaces <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'(&nbsp; +)'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, nodots)\n 
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30a2\u30b9\u30bf\u30ea\u30b9\u30af(*)\u3092\u6d88\u3059<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nostar <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'.[*]'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">''<\/font>, nospaces)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u884c\u306e\u5148\u982d\u306b\u6539\u884c\u3084\u30b3\u30f3\u30de\u304c\u3042\u308c\u3070\u305d\u308c\u3092\u6d88\u3059<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; flushleft <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'^\\W'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">''<\/font>, nostar)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u4e8c\u91cd\u30b3\u30f3\u30de\u3092\u6d88\u3059\uff08Evarts\u306e\u500b\u6240\uff09<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; comma <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">',{2,3}'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, flushleft)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u5358\u8a9e\u9593\u306e\u7a7a\u767d\u304c\u306a\u3044\u500b\u6240\u3092\u6574\u5f62\u3059\u308b\uff08\u4f8b\u3048\u3070Dawes\u3068Manderson\uff09<\/em>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <em>#\u4f4f\u6240\u306b00\u3068\u3044\u3046\u8868\u8a18\u304c\u3042\u308b\u3068\u3053\u308d\u306f\u3001\u305d\u306e\u4e8c\u91cd\u306e00\u3092\u30b9\u30ad\u30c3\u30d7\u3055\u305b\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; caps <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" 
class=\"has-text-color\">'[A-N|P-Z]{2,}'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, comma)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30d4\u30ea\u30aa\u30c9\u3092\u53d6\u308a\u9664\u3044\u3066\u3001NE\u3068NW\u306e\u30a4\u30f3\u30c7\u30a3\u30b1\u30fc\u30bf\u3092\u6574\u5f62\u3059\u308b(\u4e0a\u3067\u5b9a\u7fa9\u3057\u305fVERBOSE\u30e2\u30fc\u30c9\u3092\u5229\u7528\u3059\u308b)<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ne <strong>=<\/strong> re<strong>.<\/strong>sub(ne_pattern, <font style=\"color:#ff0000\" class=\"has-text-color\">' NE'<\/font>, caps)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nw <strong>=<\/strong> re<strong>.<\/strong>sub(nw_pattern, <font style=\"color:#ff0000\" class=\"has-text-color\">' NW'<\/font>, ne)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u30e9\u30b9\u30c8\u30cd\u30fc\u30e0\u3068\u30d5\u30a1\u30fc\u30b9\u30c8\u30cd\u30fc\u30e0\u306e\u9593\u306b\u3042\u308b\u30d4\u30ea\u30aa\u30c9\u3092\u30b3\u30f3\u30de\u306b\u5909\u63db(Chace, Cockrell\u306e\u500b\u6240)<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; match <strong>=<\/strong> re<strong>.<\/strong>search(<font style=\"color:#ff0000\" class=\"has-text-color\">'^([A-Z][a-z]+\\.)'<\/font>, nw)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <strong>if<\/strong> match:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; names <strong>=<\/strong> re<strong>.<\/strong>sub(<font style=\"color:#ff0000\" class=\"has-text-color\">'\\.'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">','<\/font>, nw)\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <strong>else<\/strong>:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; names <strong>=<\/strong> nw\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" 
class=\"has-text-color\"><em>#\u30eb\u30fc\u30d7\u3057\u3066\u3044\u308b\u9593\u3001\u5404\u884c\u3092\u30ea\u30b9\u30c8CleanText\u306b\u8ffd\u52a0\u3059\u308b<\/em><\/font>\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; CleanText<strong>.<\/strong>append(names)\n &nbsp;\n <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#\u201c\u30d5\u30a7\u30a4\u30af\u201d\u306eCSV\u30d5\u30a1\u30a4\u30eb\u306b\u4fdd\u5b58\u3059\u308b&nbsp;&nbsp; <\/em><\/font>\n <strong>with<\/strong> <font style=\"color:#0000ff\" class=\"has-text-color\">open<\/font>(<font style=\"color:#ff0000\" class=\"has-text-color\">'cdocr2\/50-1p1.csv'<\/font>, <font style=\"color:#ff0000\" class=\"has-text-color\">'w'<\/font>) <strong>as<\/strong> fcsv:\n &nbsp;&nbsp;&nbsp; <font style=\"color:#aa9857\" class=\"has-text-color\"><em>#CleanText\u5185\u306e\u5404\u884c\u3092\u30d5\u30a1\u30a4\u30eb\u306b\u66f8\u304d\u8fbc\u3080<\/em><\/font>\n &nbsp;&nbsp;&nbsp; <strong>for<\/strong> line <strong>in<\/strong> CleanText:\n &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; fcsv<strong>.<\/strong>write(line) <\/pre>\n\n\n\n<p>\u6700\u5f8c\u306b\u3001\u6b63\u898f\u8868\u73fe\u306f\u81c6\u75c5\u8005\u306e\u305f\u3081\u306e\u3082\u306e\u3067\u306f\u306a\u3044\u3053\u3068\u3092\u4ed8\u3051\u52a0\u3048\u3066\u304a\u304d\u307e\u3059\u3002\u6b63\u898f\u8868\u73fe\u306f\u30d1\u30ef\u30d5\u30eb\u306a\u30c4\u30fc\u30eb\u3067\u3059\u3002\u3042\u306a\u305f\u306e\u30c7\u30fc\u30bf\u3092\u5b8c\u5168\u306b\u7834\u58ca\u3067\u304d\u308b\u307b\u3069\u30d1\u30ef\u30d5\u30eb\u306a\u306e\u3067\u3059\u3002\u3067\u3059\u306e\u3067\u3001\u5b9f\u969b\u306b\u6b63\u898f\u8868\u73fe\u3092\u884c\u3046\u5834\u5408\u306f\u3001\u5fc5\u305a\u30d5\u30a1\u30a4\u30eb\u3092\u30b3\u30d4\u30fc\u3057\u3066\u7df4\u7fd2\u3057\u5c11\u3057\u305a\u3064\u6163\u308c\u3066\u3044\u304d\u307e\u3057\u3087\u3046\u3002<\/p>\n\n\n\n<hr 
class=\"wp-block-separator\"\/>\n\n\n\n<p><strong>\u8457\u8005\u306b\u3064\u3044\u3066<\/strong><\/p>\n\n\n\n<p>Laura Turner O&#8217;Hara\u6c0f\u306f\u7c73\u56fd\u4e0b\u9662\u6b74\u53f2\u90e8\uff08U.S. House of Representatives, Office of the Historian\uff09\u306b\u52e4\u52d9\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n\n\n\n<hr class=\"wp-block-separator\"\/>\n\n\n\n<h5 class=\"wp-block-heading\">\u5f15\u7528\u306e\u969b\u306f\u3053\u3061\u3089\u3092\u3054\u5229\u7528\u304f\u3060\u3055\u3044<\/h5>\n\n\n\n<p>\uff1c\u539f\u8457\uff1e<br>\nLaura Turner O&#8217;Hara, &#8220;Cleaning OCR\u2019d text with Regular Expressions,&#8221;&nbsp;<em>The Programming Historian<\/em>&nbsp;2 (2013), <a href=\"https:\/\/programminghistorian.org\/en\/lessons\/cleaning-ocrd-text-with-regular-expressions\">https:\/\/programminghistorian.org\/en\/lessons\/cleaning-ocrd-text-with-regular-expressions<\/a>. <\/p>\n\n\n\n<p>\uff1c\u7ffb\u8a33\u8a18\u4e8b\uff1e<br>Laura Turner O&#8217;Hara\u8457. \u83ca\u6c60\u4fe1\u5f66\u8a33. \u6b63\u898f\u8868\u73fe\u3092\u5229\u7528\u3057\u305fOCR\u30c6\u30ad\u30b9\u30c8\u306e\u30af\u30ea\u30fc\u30cb\u30f3\u30b0\u624b\u6cd5, \u6771\u30a2\u30b8\u30a2DH\u30dd\u30fc\u30bf\u30eb. 2020. 
<a href=\"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/?p=75\">https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/?p=75<\/a>.<\/p>\n\n\n\n<a rel=\"license\" href=\"http:\/\/creativecommons.org\/licenses\/by\/4.0\/\"><img decoding=\"async\" alt=\"\u30af\u30ea\u30a8\u30a4\u30c6\u30a3\u30d6\u30fb\u30b3\u30e2\u30f3\u30ba\u30fb\u30e9\u30a4\u30bb\u30f3\u30b9\" style=\"border-width:0\" src=\"https:\/\/i.creativecommons.org\/l\/by\/4.0\/88x31.png\"><\/a><br>\u3053\u306e \u4f5c\u54c1 \u306f <a rel=\"license\" href=\"http:\/\/creativecommons.org\/licenses\/by\/4.0\/\">\u30af\u30ea\u30a8\u30a4\u30c6\u30a3\u30d6\u30fb\u30b3\u30e2\u30f3\u30ba \u8868\u793a 4.0 \u56fd\u969b \u30e9\u30a4\u30bb\u30f3\u30b9<\/a>\u306e\u4e0b\u306b\u63d0\u4f9b\u3055\u308c\u3066\u3044\u307e\u3059\u3002\n\n\n\n<hr class=\"wp-block-separator\"\/>\n\n\n\n<div class=\"fb-comments\" data-href=\"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/?p=75\" data-numposts=\"5\" data-width=\"100%\"><\/div>\n","protected":false},"excerpt":{"rendered":"<p>Laura Turner O&#8217;Hara 
\u30b9\u30ad\u30e3\u30f3\u753b\u50cf\u3092\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf<\/p>\n","protected":false},"author":1,"featured_media":204,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[2],"tags":[15,10,13,14,12],"_links":{"self":[{"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=\/wp\/v2\/posts\/75"}],"collection":[{"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=75"}],"version-history":[{"count":48,"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=\/wp\/v2\/posts\/75\/revisions"}],"predecessor-version":[{"id":1072,"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=\/wp\/v2\/posts\/75\/revisions\/1072"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=\/wp\/v2\/media\/204"}],"wp:attachment":[{"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=75"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=75"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.dh.ku-orcas.kansai-u.ac.jp\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=75"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}