Hi,
Apologies if this seems dumb; I can do this in PowerShell, but I have a requirement to use Power Automate (PA) instead.
Essentially, I have a 2-page PDF as a sample (some PDFs will contain more pages, and occasionally the input could be a JPG in the same format as the PDF), and each page has a number at the top that I need to extract. Occasionally a page will not have a number at the top; I don't care about these pages! The number will ALWAYS be 8 digits and will ALWAYS be at the absolute top of the page, if that helps.
I have used the "Recognize text in an image or a PDF document" action, and it outputs JSON in the format below. (Each 8-digit number appears twice on each page; the top number is the easiest to target, as it will always be the first line on each page.)
[
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"page": 1,
"lines@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"lines": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "43431212",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.7324705882352941,
"top": 0.09742727272727274,
"width": 0.12194117647058833,
"height": 0.015645454545454532,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.7324705882352941,
"y": 0.09742727272727274
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8544117647058824,
"y": 0.09742727272727274
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8544117647058824,
"y": 0.11307272727272727
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.7324705882352941,
"y": 0.11307272727272727
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "I am the first page in the PDF. I am some junk text that I do not need, this",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.19141176470588236,
"top": 0.2245181818181818,
"width": 0.6360823529411764,
"height": 0.014481818181818179,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19141176470588236,
"y": 0.2245181818181818
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8274941176470588,
"y": 0.2245181818181818
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8274941176470588,
"y": 0.239
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19141176470588236,
"y": 0.239
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "can be ignored as it is not part of the process.",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.1905529411764706,
"top": 0.2461818181818182,
"width": 0.3952,
"height": 0.014490909090909082,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.1905529411764706,
"y": 0.2461818181818182
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.5857529411764706,
"y": 0.2461818181818182
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.5857529411764706,
"y": 0.2606727272727273
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.1905529411764706,
"y": 0.2606727272727273
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "This is another paragraph of text that can be ignored, it has no benefit",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.1897529411764706,
"top": 0.2777,
"width": 0.6111882352941177,
"height": 0.014481818181818151,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.1897529411764706,
"y": 0.2777
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8009411764705883,
"y": 0.2777
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8009411764705883,
"y": 0.29218181818181815
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.1897529411764706,
"y": 0.29218181818181815
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "whatsoever.",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.19016470588235296,
"top": 0.2998909090909091,
"width": 0.1059176470588235,
"height": 0.011609090909090902,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19016470588235296,
"y": 0.2998909090909091
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.29608235294117646,
"y": 0.2998909090909091
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.29608235294117646,
"y": 0.3115
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19016470588235296,
"y": 0.3115
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "43431212",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.23502352941176471,
"top": 0.7815181818181819,
"width": 0.12192941176470587,
"height": 0.015645454545454518,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.23502352941176471,
"y": 0.7815181818181819
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.3569529411764706,
"y": 0.7815181818181819
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.3569529411764706,
"y": 0.7971636363636364
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.23502352941176471,
"y": 0.7971636363636364
}
]
}
}
}
]
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"page": 2,
"lines@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"lines": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "12341234",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.7343764705882353,
"top": 0.12379090909090908,
"width": 0.1208588235294118,
"height": 0.01564545454545456,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.7343764705882353,
"y": 0.12379090909090908
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8552352941176471,
"y": 0.12379090909090908
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8552352941176471,
"y": 0.13943636363636364
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.7343764705882353,
"y": 0.13943636363636364
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "I am another page. I am some junk text that I do not need, this can be",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.19141176470588236,
"top": 0.25125454545454545,
"width": 0.6053176470588235,
"height": 0.01441818181818183,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19141176470588236,
"y": 0.25125454545454545
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.7967294117647059,
"y": 0.25125454545454545
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.7967294117647059,
"y": 0.2656727272727273
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19141176470588236,
"y": 0.2656727272727273
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "ignored as it is not part of the process.",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.19101176470588235,
"top": 0.27285454545454546,
"width": 0.3325058823529412,
"height": 0.014481818181818151,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19101176470588235,
"y": 0.27285454545454546
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.5235176470588235,
"y": 0.27285454545454546
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.5235176470588235,
"y": 0.2873363636363636
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19101176470588235,
"y": 0.2873363636363636
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "This is another paragraph of text that can be ignored, it has no benefit",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.1897529411764706,
"top": 0.3045181818181818,
"width": 0.6111882352941177,
"height": 0.014481818181818207,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.1897529411764706,
"y": 0.3045181818181818
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8009411764705883,
"y": 0.3045181818181818
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.8009411764705883,
"y": 0.319
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.1897529411764706,
"y": 0.319
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "whatsoever.",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.19016470588235296,
"top": 0.32655454545454543,
"width": 0.1059176470588235,
"height": 0.011609090909090958,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19016470588235296,
"y": 0.32655454545454543
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.29608235294117646,
"y": 0.32655454545454543
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.29608235294117646,
"y": 0.3381636363636364
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.19016470588235296,
"y": 0.3381636363636364
}
]
}
}
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"text": "12341234",
"boundingBox": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"left": 0.2369294117647059,
"top": 0.8081818181818182,
"width": 0.12085882352941174,
"height": 0.015645454545454407,
"polygon": {
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"coordinates@odata.type": "#Collection(Microsoft.Dynamics.CRM.crmbaseentity)",
"coordinates": [
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.2369294117647059,
"y": 0.8081818181818182
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.35778823529411763,
"y": 0.8081818181818182
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.35778823529411763,
"y": 0.8238272727272726
},
{
"@odata.type": "#Microsoft.Dynamics.CRM.expando",
"x": 0.2369294117647059,
"y": 0.8238272727272726
}
]
}
}
}
]
}
]
I then use an "Apply to each" to pick out the extracted text and append it to an array variable.

When I compose this, obviously all the text data, including the body of the PDF, is included in the array.

Is there any way to extract, or even filter, the array down to just the first number from each page (hopefully without duplicates), so that the result looks like this:
[
"43431212",
"12341234"
]
Many thanks 🙂