# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
# Pattern matching one complete dictionary entry of the sample text.
#
# The pattern was originally written in PCRE syntax, which Python's ``re``
# module cannot compile. The PCRE-only constructs are therefore spelled with
# ``re``-compatible equivalents:
#   (?<name>...)                  ->  (?P<name>...)
#   \pL  (any Unicode letter)     ->  [^\W\d_]        (close approximation)
#   \h   (horizontal whitespace)  ->  [^\S\r\n]       (close approximation)
#   \R   (any line break)         ->  (?:\r\n|\r|\n)  (common line breaks only)
#
# It must be compiled with re.VERBOSE (layout whitespace and "#" comments in
# the pattern are ignored) and re.MULTILINE ("^" anchors at every line start).
# Group order is unchanged, so numbered references \1..\8 keep their meaning.
regex = r"""
^
(?P<frequency> [0-9]+ ) \W+                           # leading number, then the blank line(s)
(?P<word> [^\W\d_]+ (?: , [^\S\r\n] [^\W\d_]+ | \W )* ) [^\S\r\n]+
(?P<gender> (?:[^\W\d_]|[()])+ (?: , [^\S\r\n]* (?:[^\W\d_]|[()])+ )* ) [^\S\r\n]+
(?P<word_en> [^•]*[^•\s] ) [^\S\r\n]* (?:\r\n|\r|\n)  # rest of the headword line
• [^\S\r\n]*
(?P<sent_esp> [^–]*[^\s–] ) \s*–\s*                   # text before the en dash (may span lines)
(?P<sent_en> .* (?: (?:\r\n|\r|\n) .* )*? ) [^\S\r\n]* (?:\r\n|\r|\n)
(?P<num1> [0-9]+ ) [^\S\r\n]* \| [^\S\r\n]*           # the two numbers around the pipe
(?P<num2> .*\S )
"""
# Sample input the pattern is run against: 100 numbered entries separated by
# blank lines. Each entry is laid out as
#   <entry number>
#   (blank line)
#   <headword(s)> <short grammatical tag: art, prep, v, nm, ...> <English gloss>
#   • <Spanish example> – <English translation>   (either side may wrap onto the next line)
#   <number> | <number> [optional trailing markers such as +o / +w]
# The adjacent string literals below are concatenated into one single string.
test_str = ("1\n\n"
"el, la art the (+m, f)\n"
"• el diccionario tenía también frases útiles – the dictionary also had\n"
"useful phrases\n"
"2055835 | 201481381\n\n"
"2\n\n"
"de prep of, from\n"
"• es el hijo de un amigo mío – he is the son of a friend of mine\n"
"1104364 | 133341468\n\n"
"3\n\n"
"que conj that, which\n"
"• dice que no quiere estudiar – he says that he doesn’t want to\n"
"study\n"
"543408 | 70319525\n\n"
"4\n\n"
"y conj and\n"
"• saben leer y escribir – they know how to read and write\n"
"524574 | 56511956\n\n"
"5\n\n"
"en prep in, on\n"
"• vivo en el segundo piso – I live on the second floor\n"
"496295 | 49340069\n\n"
"6\n\n"
"un art a, an\n"
"• era un hombre simpático – he was a nice man\n"
"381976 | 37389547\n\n"
"7\n\n"
"ser v to be (norm)\n"
"• la primera persona narrativa puede ser por el protagonista – the\n"
"first-person narrative can be through the protagonist\n"
"324390 | 40017214\n\n"
"8\n\n"
"a prep to, at\n"
"• me voy al extranjero por el verano – I am going abroad for the\n"
"summer\n"
"328540 | 48746618\n\n"
"9\n\n"
"él pron he, [ellos] them (m)\n"
"• él es bastante simpático – he is very nice\n"
"428383 | 2743194\n\n"
"10\n\n"
"lo art the (+neut)\n"
"• lo mejor es estudiar mucho – the best thing is to study a lot\n"
"346026 | 31016059\n\n"
"11\n\n"
"no adv no\n"
"• no tiene dinero – he has no money\n"
"198473 | 21970515 +o\n\n"
"12\n\n"
"su adj his/her/their/your (–fam)\n"
"• ¿Quién era? ¿Su hermana? ¿Su amiga? – Who was it? His sister?\n"
"Her friend?\n"
"173372 | 17883988\n\n"
"13\n\n"
"haber v to have (+Ved)\n"
"• no ha dicho nada – he hasn’t said a thing\n"
"155853 | 15650271\n\n"
"14\n\n"
"con prep with\n"
"• hay un hombre con ella – there is a man with her\n"
"163958 | 18316135\n\n"
"15\n\n"
"por prep by, for, through\n"
"• caminamos por el parque – we walked through the park\n"
"174450 | 17512892\n\n"
"16\n\n"
"para prep for, to, in order to\n"
"• ¡Tengo una sorpresa sensacional para ti! – I have a wonderful\n"
"surprise for you!\n"
"112694 | 16947562\n\n"
"17\n\n"
"mí pron me (obj prep)\n"
"• el regalo era para mí – the gift was for me\n"
"114124 | 608449 +o\n\n"
"18\n\n"
"lo pron [3rd person] (dir obj-m)\n"
"• lo compré en la tienda – I bought it at the store\n"
"101112 | 5126699 +o\n\n"
"19\n\n"
"tener v to have\n"
"• Mi padre tiene varios relojes antiguos. – My father has several\n"
"antique watches.\n"
"85865 | 11087983 +o\n\n"
"20\n\n"
"como conj like, as\n"
"• ser compositor en España es como ser torero en Finlandia –\n"
"being a composer in Spain is like being a bullfighter in Finland\n"
"98660 | 12480595\n\n"
"21\n\n"
"estar v to be (location, change from norm)\n"
"• él está en el trabajo – he is at work\n"
"88421 | 9574000\n\n"
"22\n\n"
"me pron me (obj)\n"
"• ¿Cuando me va a llamar? – When is he going to call me?\n"
"64024 | 8211349 +o\n\n"
"23\n\n"
"más adj more\n"
"• él necesitaba más dinero – he needed more money\n"
"86998 | 9003898\n\n"
"24\n\n"
"este adj this (m) [esta (f)]\n"
"• es igual en el este como en el oeste – it is the same in the east as\n"
"in the west\n"
"73920 | 13208363\n\n"
"25\n\n"
"le pron [3rd person] (indir obj)\n"
"• nunca le dijo la verdad – she never told him the truth\n"
"64833 | 7787199\n\n"
"26\n\n"
"hacer v to do, make\n"
"• he podido hacer lo que me gusta – I have been able to do what I\n"
"like\n"
"64199 | 8808576\n\n"
"27\n\n"
"se pron [“reflexive” marker]\n"
"• el saber de los geógrafos griegos se difundió por Europa – Greek\n"
"knowledge of geography spread throughout Europe\n"
"60393 | 28984360 +w\n\n"
"28\n\n"
"yo pron I (subj)\n"
"• ¡Yo soy el padre! – I am the father!\n"
"45822 | 10894793 +o +w\n\n"
"29\n\n"
"o conj or\n"
"• sí esperaba uno o dos muertos – yes, he expected one or two\n"
"deaths\n"
"71054 | 9100644\n\n"
"30\n\n"
"pero conj but, yet, except\n"
"• no significa nada para mí, pero no puedo olvidarla – it means\n"
"nothing to me, but I can’t forget it\n"
"66748 | 6871867\n\n"
"31\n\n"
"decir v to tell, say\n"
"• Parece que dice la verdad. – It seems like he is telling the truth.\n"
"61678 | 6016427 +o\n\n"
"32\n\n"
"poder v to be able to, can\n"
"• Con estos datos se puede tener una primera idea del\n"
"mecanismo. – With these facts we can begin to understand the\n"
"mechanism\n"
"61177 | 9248731\n\n"
"33\n\n"
"ir v to go\n"
"• ¿Quieres ir a la playa este fin de semana? – Do you want to go to\n"
"the beach this weekend?\n"
"52054 | 4435468 +o\n\n"
"34\n\n"
"ese adj that (m) [esa (f)]\n"
"• ¿Dónde viven esas mujeres? – Where do these women live?\n"
"48535 | 4683205 +o\n\n"
"35\n\n"
"otro adj other, another\n"
"• No comparte su taxi nunca conotrosviajeros a menos que les\n"
"conoce. – Don’t share your taxi with other travelers unless you\n"
"know them.\n"
"54762 | 5431865\n\n"
"36\n\n"
"si conj if, whether\n"
"• si quiere cazar, vamos a cazar – if he wants to hunt, let’s go\n"
"hunting\n"
"38328 | 6519137\n\n"
"37\n\n"
"mi adj my\n"
"• mi casa es su casa – my house is your house\n"
"34310 | 4766226\n\n"
"38\n\n"
"ver v to see\n"
"• había que subir para ver las ruinas – you have to climb up to see\n"
"the ruins\n"
"39015 | 4122615\n\n"
"39\n\n"
"ya adv already, still\n"
"• su marido ya ha dicho todo – her husband has already said it all\n"
"37257 | 3961260 +o\n\n"
"40\n\n"
"porque conj because\n"
"• lo vendo sólo porque tengo un problema muy grande – I am only\n"
"selling it because I am in a predicament\n"
"34109 | 3154550 +o\n\n"
"41\n\n"
"mucho adj much, many, a lot (ADV)\n"
"• por lo visto tienen mucho dinero – apparently they have a lot of\n"
"money\n"
"35069 | 4459289 +o\n\n"
"42\n\n"
"dar v to give\n"
"• me dio esta carta para ud – he gave me this letter for you\n"
"32999 | 3900029 +o\n\n"
"43\n\n"
"muy adv very, really\n"
"• está muy contento con mi trabajo – she is very happy with my\n"
"work\n"
"32854 | 3217244 +o\n\n"
"44\n\n"
"saber v to know (a fact), find out\n"
"• se necesita humildad para saber reconocer las propias faltas –\n"
"you need humility to know how to recognize your own faults\n"
"27238 | 2642079 +o\n\n"
"45\n\n"
"sí adv yes\n"
"• quiero una respuesta concreta: sí o no – I need a solid answer:\n"
"yes or no\n"
"31716 | 1100318 +o\n\n"
"46\n\n"
"año nm year\n"
"• no lo supo hasta casi un año después – he didn’t find out until\n"
"almost a year later\n"
"37168 | 3792004\n\n"
"47\n\n"
"ti pron you (obj prep-sg/+fam)\n"
"• no, la carta no es para ti – no, the letter is not for you\n"
"29283 | 366540 +o\n\n"
"48\n\n"
"te pron you (obj/+fam)\n"
"• ¿No te han hablado? – They haven’t spoken with you?\n"
"19928 | 4017417 +o\n\n"
"49\n\n"
"también adv also\n"
"• también habla italiano – she speaks Italian as well\n"
"31172 | 3209504\n\n"
"50\n\n"
"qué pron what?, which?, how (+ADJ)!\n"
"• no sé qué voy a hacer – I don’t know what I’m going to do\n"
"29722 | 2317175 +o\n\n"
"51\n\n"
"alguno adj some, a few\n"
"• habló algunas palabras con el agente de negocios – he spoke a\n"
"few words with the business agent\n"
"27224 | 3037373\n\n"
"52\n\n"
"nos pron us (obj)\n"
"• nos vio en la calle – he saw us on the street\n"
"17300 | 4445709 +o +w\n\n"
"53\n\n"
"tu adj your (+fam)\n"
"• ésta es tu casa y ésta es tu cama – this is your house and this is\n"
"your bed\n"
"8068 | 7260841 +w\n\n"
"54\n\n"
"sin prep without\n"
"• se habían quedado sin dinero – they were left without money\n"
"25332 | 2499874\n\n"
"55\n\n"
"mismo adj same\n"
"• pronunciando el mismo discurso en siete idiomas – giving the\n"
"same speech in seven languages\n"
"23973 | 3339470\n\n"
"56\n\n"
"eso pron that (n)\n"
"• y eso no es todo – and that isn’t everything\n"
"20972 | 2339488 +o\n\n"
"57\n\n"
"cuando adv when\n"
"• aquel libro es de mis nietos, cuando eran bebes todavía – that\n"
"book belonged to my grandchildren, when they were still babies\n"
"32690 | 71759\n\n"
"58\n\n"
"querer v to want, love\n"
"• quiero que este proceso salga con limpieza – I want this process\n"
"to go cleanly\n"
"20091 | 2897092\n\n"
"59\n\n"
"vez nf time (specific occurrence); en v. de: instead of\n"
"• es la primera vez que estoy junto al mar – this is the first time\n"
"that I have been near to the sea\n"
"27629 | 1558559\n\n"
"60\n\n"
"hasta prep until, up to, even (ADV)\n"
"• toda la noche, hasta las tres de la mañana – all night long, until\n"
"three in the morning\n"
"27638 | 2258791\n\n"
"61\n\n"
"la pron [3rd person] (dir obj-f)\n"
"• la puso en su bolsillo – he put it in his pocket\n"
"8190 | 7941346 +o +w\n\n"
"62\n\n"
"sobre prep on top of, over, about\n"
"• dejó el papel sobre la mesa y se fue – he left the paper on top of\n"
"the table and left\n"
"26590 | 3030210\n\n"
"63\n\n"
"entre prep between, among\n"
"• la cosa es entre tú y yo – the matter is between you and me\n"
"32379 | 2607736\n\n"
"64\n\n"
"dos num two\n"
"• la familia se reúne cada dos años – the family gets together every\n"
"two years\n"
"27487 | 2017874\n\n"
"65\n\n"
"día nm day\n"
"• cada día hay más problemas – every day there are more\n"
"problems\n"
"20735 | 2577473\n\n"
"66\n\n"
"grande adj large, great, big\n"
"• tiene los grandes ojos negros y bonitos – she has the biggest and\n"
"most beautiful black eyes\n"
"29813 | 2030849\n\n"
"67\n\n"
"así adv like that\n"
"• la vida es así – life is like that\n"
"21331 | 2321804\n\n"
"68\n\n"
"pasar v to pass, spend (time)\n"
"• no ha pasado ni un bus – not a single bus has passed by\n"
"20946 | 1905621\n\n"
"69\n\n"
"cosa nf thing\n"
"• estoy ya interesado en otra cosa – I am interested in a different\n"
"matter\n"
"17977 | 1903169 +o\n\n"
"70\n\n"
"desde prep from, since\n"
"• Lo habia pensado desde el principio que era una mal idea. – I\n"
"thought from the beginning that this was a bad idea.\n"
"24743 | 2424665\n\n"
"71\n\n"
"deber v should, ought to; to owe\n"
"• la cosa está en el lugar donde debe estar – the thing is in the\n"
"place where it should be\n"
"19965 | 3248053\n\n"
"72\n\n"
"ella pron she, [ellas] them (f)\n"
"• ella es muy estudiosa – she is very studious\n"
"18720 | 1491405\n\n"
"73\n\n"
"pues conj then, well then\n"
"• pues, venga usted cuando quiera – well then, come whenever\n"
"you want\n"
"18535 | 1210256 +o\n\n"
"74\n\n"
"entonces adv so, then\n"
"• vamos entonces a cambiarlo – then we are going to change it\n"
"19759 | 888406 +o\n\n"
"75\n\n"
"llegar v to arrive\n"
"• por fin llegamos al fondo de la catarata – we finally arrived at the\n"
"bottom of the waterfall\n"
"18877 | 1848469\n\n"
"76\n\n"
"poco adj little, few, a little bit (adv)\n"
"• trabajó poco tiempo con él – he worked a little while with him\n"
"18209 | 1845555\n\n"
"77\n\n"
"nuestro adj our\n"
"• no servía para nuestro país porque somos diferentes – it was no\n"
"good for our contry because we’re different\n"
"13411 | 3496070 +w\n\n"
"78\n\n"
"bien adv well\n"
"• el número de bienes que pueda poseer en un momento dado –\n"
"the number of goods one possesses in a given moment\n"
"16402 | 1959560 +o\n\n"
"79\n\n"
"ni conj not even, neither, nor\n"
"• pero ni eso me tranquilizó – but not even that reassured me\n"
"16340 | 1927472\n\n"
"80\n\n"
"tiempo nm time (general)\n"
"• ha estado mucho tiempo con ella – he has been with her a long\n"
"time\n"
"18408 | 1921017\n\n"
"81\n\n"
"ahora adv now\n"
"• lo importante ahora es contarte lo que le pasó – the important\n"
"thing now is to tell you what happened\n"
"17072 | 1591817 +o\n\n"
"82\n\n"
"primero adj first\n"
"• Primero, vamos a ir al supermercado. – First, we are going to go\n"
"to the grocery store.\n"
"20758 | 2484553\n\n"
"83\n\n"
"creer v to believe, think\n"
"• creo en la justicia de Dios – I believe in the justice of God\n"
"15928 | 1685972 +o\n\n"
"84\n\n"
"donde adv where\n"
"• no sé donde está la llave – I don’t know where the key is\n"
"17601 | 2151872\n\n"
"85\n\n"
"vida nf life\n"
"• ha dedicado toda la vida a la música – she has dedicated her life\n"
"to music\n"
"15149 | 2424426\n\n"
"86\n\n"
"dejar v to let, leave\n"
"• ella no dejó que yo lo olvidara – she didn’t let me forget it\n"
"14241 | 1881239\n\n"
"87\n\n"
"nada pron nothing, (not) at all\n"
"• no hay nada que hacer – there is nothing to do\n"
"14024 | 1498787 +o\n\n"
"88\n\n"
"tanto adj so much, so many\n"
"• no podía creer que haya tanta gente junta – I couldn’t believe\n"
"there were so many people together\n"
"16618 | 1960237\n\n"
"89\n\n"
"parecer v to seem, look like\n"
"• Mi burra es tan flaca parece un esqueleto. – My donkey is so\n"
"thin he looks like a skeleton.\n"
"15096 | 1330117\n\n"
"90\n\n"
"hablar v to speak, talk\n"
"• algunos hablan quechua, español, e inglés – some of them\n"
"speak Quechua, Spanish, and English\n"
"14325 | 1432852 +o\n\n"
"91\n\n"
"poner v to put (on), get (+ADJ)\n"
"• puso el cuchillo en la mano – he put the knife in his hand\n"
"14619 | 1657298\n\n"
"92\n\n"
"parte nf part, portion\n"
"• el arte es una parte tan importante de la cultura – art is such an\n"
"important part of culture\n"
"20938 | 1744336\n\n"
"93\n\n"
"eh interj eh\n"
"• Te interesa esta bufanda, ¿eh? – You are interested in this scarf,\n"
"eh?\n"
"16011 | 33653 +o\n\n"
"94\n\n"
"nuevo adj new\n"
"• marcaba el comienzo de una nueva vida para mí – it marked the\n"
"beginning of a new life for me\n"
"19641 | 1964400\n\n"
"95\n\n"
"sólo adv only, just\n"
"• no siempre vestía ropa de mujer; sólo en carnaval – I don’t\n"
"always wear women’s clothes; only during carnival\n"
"15824 | 2043480\n\n"
"96\n\n"
"siempre adv always, forever\n"
"• siempre ha sido así – she has always been like that\n"
"13141 | 1671829\n\n"
"97\n\n"
"hombre nm man, mankind, husband\n"
"• yo soy un hombre de pocas necesidades – I am a man of few\n"
"needs\n"
"14773 | 1240825\n\n"
"98\n\n"
"bueno adv well . . .\n"
"• Bueno, ¿y ahora qué hacemos? – Well, so what are we going to\n"
"do now?\n"
"15729 | 1108838 +o\n\n"
"99\n\n"
"seguir v to follow, keep on\n"
"• no la puedo seguir porque habla demasiado rápido – I couldn’t\n"
"keep up because she was speaking so quickly\n"
"14081 | 1824937\n\n"
"100\n\n"
"quedar v to remain, stay\n"
"• El aire empezó a quedarse quieto. – The air began to remain still.\n"
"14040 | 1348837")
# Replacement template: the first six captured groups (frequency, word,
# gender, word_en, sent_esp, sent_en) joined by tabs, with a trailing tab.
# The num1/num2 groups are matched but not written back, so the two trailing
# numbers of every matched entry are dropped from the output.
subst = "\\1\\t\\2\\t\\3\\t\\4\\t\\5\\t\\6\\t"

# count=0 replaces every match; use a positive count to limit replacements.
# count and flags are passed by keyword: passing them positionally is
# deprecated since Python 3.13 and makes it easy to hand flags in as count.
result = re.sub(
    regex,
    subst,
    test_str,
    count=0,
    flags=re.UNICODE | re.VERBOSE | re.MULTILINE,
)

# Print only when the substitution produced a non-empty string.
if result:
    print(result)
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
# Please keep in mind that these code samples are automatically generated and are not guaranteed to work.
# If you find any syntax errors, feel free to submit a bug report.
# For a full regex reference for Python, please visit: https://docs.python.org/3/library/re.html